In [51]:
import numpy as np
import pandas as pd
from numpy.testing import assert_equal, assert_array_equal, assert_allclose
from pandas.testing import assert_series_equal, assert_frame_equal

## Problem 1
Create a function `init_series` that will return the `pandas.Series` `s1`, `s2` and `s3` shown below. Create another function `init_df` that takes in `s1` and `s2` and returns `df1` and `df2` below.

<img src="series.png" style="width: 15em; display: inline" /> <img src="df.png" style="width: 10em; display: inline" />

In [52]:
def init_series():
    '''reads the data from /mnt/data/public/retaildata/Online Retail.csv and 
    returns the sum of the first 5 rows with the corresponding row of the last
    5 rows for columns Quantity and UnitPrice as a numpy array.
    
    Returns
    -------------
    s1, s2, s3: pandas Series 
        
    
    '''
    s1 = pd.Series(np.arange(1,101,11, dtype = 'float64'), index=list('abcdefghij'), name ='foo')
    s2 = pd.Series(np.arange(0,10, dtype = 'int64'), index=list('fghijklmno'))
    s3 = pd.Series([5]*3, index=list('xyz'), dtype = 'int64')
    return s1, s2, s3


def init_df(s1, s2):
    '''reads the data from /mnt/data/public/retaildata/Online Retail.csv and 
    returns the sum of the first 5 rows with the corresponding row of the last
    5 rows for columns Quantity and UnitPrice as a numpy array.
    
    Parameters
    -------------
    s1: pandas Series
        pandas Series from init_series function
        
    s2: pandas Series 
        pandas Series from init_series function
        
    Returns
    -------------
    df1: pandas DataFrame
    
    df2: pandas DataFrame
    
    '''
    df1 = pd.DataFrame(s1)
    df2 = pd.concat([s2,s1], axis = 1, sort = True)
    df2.columns = ['Series2', 'Series1']
    
    return df1, df2

In [53]:
s1, s2, s3 = init_series()
df1, df2 = init_df(s1, s2)
assert_equal(isinstance(s1, pd.Series), True)
assert_equal(isinstance(s2, pd.Series), True)
assert_equal(isinstance(s3, pd.Series), True)
assert_equal(isinstance(df1, pd.DataFrame), True)
assert_equal(isinstance(df2, pd.DataFrame), True)

## Problem 2
Create a function `top_plus_bottom` that reads the data from `/mnt/data/public/retaildata/Online Retail.csv` and returns the sum of the first 5 rows with the corresponding row of the last 5 rows for columns `Quantity` and `UnitPrice` as a numpy array.

In [54]:
def top_plus_bottom():
    '''reads the data from /mnt/data/public/retaildata/Online Retail.csv and 
    returns the sum of the first 5 rows with the corresponding row of the last
    5 rows for columns Quantity and UnitPrice as a numpy array.
    
    Returns
    -------------
    sum_arr: numpy array 
        sum of the first 5 rows with the corresponding row of the last 5 rows 
        for columns Quantity and UnitPrice
    
    '''
    df = pd.read_csv('/mnt/data/public/retaildata/Online Retail.csv')
    s_head = df[['Quantity','UnitPrice']].head()
    s_tail = df[['Quantity','UnitPrice']].tail()
    sum_arr = np.array(s_head) + np.array(s_tail)
    return sum_arr

In [55]:
tb_sum = top_plus_bottom()
assert_equal(isinstance(tb_sum, np.ndarray), True)
assert_equal(tb_sum.shape, (5, 2))

## Problem 3
Create a function `count_alone` that reads `/mnt/data/public/elections/comelec/voters_profile/philippine_2016_voter_profile_by_age_group.csv` and returns a `pandas` `DataFrame` with an additional column `alone` that is equal to the sum of the `single`, `widow` and `legally_seperated` (sic) columns. The dataframe should be sorted in decreasing order based on `alone`.

In [56]:
def count_alone():
    '''reads /mnt/data/public/elections/comelec/voters_profile/
    philippine_2016_voter_profile_by_age_group.csv and returns a pandas 
    DataFrame with an additional column alone that is equal to the sum of the 
    single, widow and legally_seperated (sic) columns. The dataframe is sorted
    in decreasing order based on alone.
    
    Returns
    -------------
    output: pandas DataFrame 
        DataFrame with an additional column alone that is equal to the sum of 
        the single, widow and legally_seperated (sic) columns. The dataframe 
        is sorted in decreasing order based on alone.
    
    '''
    df3 = pd.read_csv('/mnt/data/public/elections/comelec/voters_profile' +
    '/philippine_2016_voter_profile_by_age_group.csv')

    SWL = df3[['single','widow','legally_seperated']].head()
    sum_SWL = 0
    for i in SWL:
        sum_SWL = sum_SWL + df3[i]

#   sum_SWL = df3['single'] + df3['widow'] + df3['legally_seperated']
#   commented line of code is alternative 

    df3['alone'] = sum_SWL
    return df3

count_alone()[['alone']] #check values if correct

Unnamed: 0,alone
0,2988048
1,7427782
2,5914888
3,3797520
4,2435669
5,1454999
6,968219
7,675589
8,524783
9,427863


In [57]:
df_alone = count_alone()
assert_array_equal(
    df_alone.columns,
    ['age_group', 'registered_voter', 'male', 'female', 'literacy',
     'indigenous_people', 'person_with_disability', 'single', 'married',
     'widow', 'legally_seperated', 'alone'])

## Problem 4
Create a function `scores_stats` that reads `/mnt/data/public/movielens/20m/ml-20m/genome-scores.csv` and returns a `pandas` `Series` that contains the count, mean, standard deviation and quartiles of the `relevance` column.

In [58]:
def scores_stats():
    '''reads /mnt/data/public/movielens/20m/ml-20m/genome-scores.csv and 
    returns a pandas Series that contains the count, mean, standard deviation 
    and quartiles of the relevance column. 
    
    Returns
    -------------
    output: pandas Series 
        It contains the count, mean, standard deviation and quartiles of the 
        relevance column.
    
    '''
    df4 = pd.read_csv('/mnt/data/public/movielens/20m'
                      +'/ml-20m/genome-scores.csv')
    S_rel = df4['relevance']
    S_quan = S_rel.quantile([0.25,0.5,0.75])

    output = pd.Series([S_rel.count(),np.mean(S_rel),np.std(S_rel),\
                        np.min(S_rel)])
    output = output.append(S_quan)
    output = output.append(pd.Series([np.max(S_rel)]))
    output.index = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    
    return output

In [59]:
stats = scores_stats()
assert_equal(isinstance(stats, pd.Series), True)
assert_array_equal(
    stats.index, ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

## Problem 5
Write a function `top_cat` that reads `/mnt/data/public/agora/Agora.csv` and returns a `pandas` `Series` that contains the top 10 most popular categories along with their counts, sorted from most popular to least popular. The index of the series should be the name of the category and the value is the corresponding count. The file follows `latin1` encoding.

In [60]:
def top_cat():
    '''reads /mnt/data/public/agora/Agora.csv and returns a pandas Series that 
    contains the top 10 most popular categories along with their counts, 
    sorted from most popular to least popular. The index of the series is 
    the name of the category and the value is the corresponding count. 
    The file follows latin1 encoding. 
    
    Returns
    -------------
    output: pandas DataFrame 
        DataFrame that contains the top 10 most popular categories along with 
        their counts, sorted from most popular to least popular. The index of 
        the series is the name of the category and the value is the 
        corresponding count. The file follows latin1 encoding. 
    
    '''
    df5 = pd.read_csv('/mnt/data/public/agora/Agora.csv', encoding = 'latin_1')
    output = df5[' Category'].value_counts().head(10)
    return output
   

In [61]:
top_cats = top_cat()
assert_equal(isinstance(top_cats, pd.core.series.Series), True)

## Problem 6
Create a function `listing_info` that reads `/mnt/data/public/insideairbnb/data.insideairbnb.com/united-kingdom/england/london/2015-04-06/data/listings.csv.gz` and returns a `pandas` `DataFrame`.  The index of the data frame should be the `id` with the rows sorted by increasing order of `id`. The columns are `name`, `summary`, `space` and `description`. Include only rows for IDs from 11076 to 15400 (inclusive).

In [62]:
def listing_info():
    '''reads /mnt/data/public/insideairbnb/data.insideairbnb.com/
    united-kingdom/england/london/2015-04-06/data/listings.csv.gz and returns 
    a pandas DataFrame. The index of the data frame should be the id with the 
    rows sorted by increasing order of id. The columns are name, summary, 
    space and description. Include rows for IDs from 11076 to 15400 
    (inclusive).
      
    Returns
    -------------
    output: pandas DataFrame
        DataFrame sorted by ID as index. Only contains IDs from 11076 to 15400
        for columns name, summary, space, and description
    '''
    
    df6 = pd.read_csv('/mnt/data/public/insideairbnb/data.insideairbnb.com/united'
                  '-kingdom/england/london/2015-04-06/data/listings.csv.gz')
    output = df6[['id','name','summary','space','description']]
    output = output[(11076<= output['id']) & (output['id']<=15400)].sort_values(by = 'id')
    output.index = output['id']
    del output['id']
    
    return output


In [63]:
listing_info()

Unnamed: 0_level_0,name,summary,space,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11076,The Sanctury,The room has a double bed and a single foldawa...,This Listing is for The Sanctury The accommoda...,The room has a double bed and a single foldawa...
11551,The bright London flat,This flat is located in the trendy neighbor of...,Important!! If you are two travelers or three ...,This flat is located in the trendy neighbor of...
11816,Modern Two Bedroom in Knightsbridge,Its all about the (email hidden)tuated at Park...,"Situated at London's most exclusive address, K...",Its all about the (email hidden)tuated at Park...
13913,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,"Hello Everyone, I'm offering my lovely double ...",My bright double bedroom with a large window h...
15400,Bright Chelsea Apartment. Chelsea!,,Bright Chelsea Apartment This is a bright one...,Bright Chelsea Apartment This is a bright one...


In [64]:
df_listing = listing_info()
assert_equal(isinstance(df_listing, pd.DataFrame), True)
assert_array_equal(
    df_listing.columns, 
    ['name', 'summary', 'space', 'description'])

## Problem 7
Create a function `aisle_dep` that reads `/mnt/data/public/instacart/instacart_2017_05_01/products.csv` and returns a `pandas` `DataFrame`. The index of the data frame should be `product_id`. For products that are found in aisle 5, append the `aisle_id` and `department_id` following the format (`aisle_id`-`department_id`) to the `product_name`. For example, if product `foo` is in aisle 5 and department 6, then the product name should be `foo (5-6)`. Products in the other aisles should be unmodified.

In [65]:
def aisle_dep():
    '''reads /mnt/data/public/instacart/instacart_2017_05_01/products.csv and 
    returns a pandas DataFrame. The index of the data frame is be product_id.
    The product name contains the product name and the aisle and department id
    if the product is found in aisle 5. 
    
    Returns
    -------------
    output: pandas DataFrame 
        DataFrame with index from product_id column values. The aisle and
        departmend id of products from aisle 5 are appended with the 
        corresponding values of its product_name column values
    
    '''
    df7 = pd.read_csv('/mnt/data/public/instacart/instacart_'
                      +'2017_05_01/products.csv')
    df7.head()
    new_S_prod_name = df7['product_name'].astype(str) + ' ' + df7['aisle_id'].astype(str)\
    + '-' + df7['department_id'].astype(str)
    new_S_prod_name.name = 'product_name'
    new_df7 = pd.DataFrame(new_S_prod_name)
    new_df7

    df7.update(new_df7)
    df7.index = df7['product_id']
    del df7['product_id']
    final_df7 = df7

    return final_df7

In [66]:
df_aisle_dep = aisle_dep()
assert_equal(isinstance(df_aisle_dep, pd.DataFrame), True)
assert_array_equal(
    df_aisle_dep.columns, ['product_name', 'aisle_id', 'department_id'])
assert_array_equal(
    df_aisle_dep.index, np.arange(1, 49689))

## Problem 8
Create a function `camsur_reps` that reads `/mnt/data/public/elections/comelec/congress_results/congressional_results_2013.csv` and returns a `pandas` `DataFrame` with two columns: `surname` and `votes_obtained`. The `surname` is derived from the `name` column. The data frame should only contain those with `province_or_city` equal to `Camarines Sur`.

In [67]:
def camsur_reps():
    ''' reads /mnt/data/public/elections/comelec/congress_results/
    congressional_results_2013.csv and returns a pandas DataFrame with two 
    columns: surname and votes_obtained. The surname is derived from the name 
    column. The data frame should only contain those with province_or_city 
    equal to Camarines Sur.
    
    Returns
    -------------
    output: pandas DataFrame 
        DataFrame with two columns: surname and votes_obtained. 
        The surname is derived from the name column. The data frame should 
        only contain those with province_or_city equal to Camarines Sur.
    
    '''
    df8 = pd.read_csv('/mnt/data/public/elections/comelec/'
                      +'congress_results/congressional_results_2013.csv')

    name_list = df8.name.str.split(',').to_list()
    surname_series = pd.Series([i[0] for i in name_list], name = 'surname')
    surname_series

    df8['surname'] = surname_series
    output = df8[df8['province_or_city'] == 'Camarines Sur']\
        [['surname','votes_obtained']]
    return output

In [68]:
df_camsur = camsur_reps()
assert_equal(isinstance(df_camsur, pd.DataFrame), True)
assert_array_equal(df_camsur.columns, ['surname', 'votes_obtained'])

## Problem 9
Create a function `no_pop` that reads `/mnt/data/public/millionsong/AdditionalFiles/tracks_per_year.txt` and returns a `pandas` `DataFrame` with columns `year`, `track_id`, `artist` and `title`. It should not include songs from the year 2000 or later, or by artists named `Britney Spears` or `Backstreet Boys`.

In [69]:
def no_pop():
    ''' reads /mnt/data/public/millionsong/AdditionalFiles/tracks_per_year.txt
    and returns a pandas DataFrame with columns year, track_id, artist and 
    title. It does not include songs from the year 2000 or later, or by 
    artists named Britney Spears or Backstreet Boys.
    
    Returns
    -------------
    output: pandas DataFrame 
        DataFrame which contains columns year, track_id, artist and title. It 
        excludes songs from the year 2000 or later, or by artists named 
        Britney Spears or Backstreet Boys.
    
    '''
    with open('/mnt/data/public/millionsong/AdditionalFiles'
              +'/tracks_per_year.txt','r') as f:
        text = f.readlines()

    #len(text)  515576 lines

    li = [i.split('<SEP>') for i in text]
    
    year_S = pd.Series([i[0] for i in li])
    
    trackid_S = pd.Series([i[1] for i in li])
    
    artist_S = pd.Series([i[2] for i in li])
    
    title_S = pd.Series([i[3][:-1] for i in li])  
    
    df_cols = [('year',year_S), ('track_id',trackid_S), \
               ('artist', artist_S), ('title', title_S)]
    
    df9 = pd.DataFrame()

    for col in df_cols:
        df9[col[0]] = col[1]

    output = df9[df9['year'].astype(int) < 2000]
    output = output[(output['artist'] != 'Britney Spears') & (output['artist'] != 'Backstreet Boys')]
  
    

#     output = df9[(df9['year'].astype(int) > 2000) & \
#                  ~(df9['artist'] == 'Britney Spears') & ~(df9['artist'] \
#                                                         == 'Backstreet Boys')]
    return output
     

In [70]:
# rows with years 2000 and later = 226354
# rows of df9 == 515576
# rows with britney spears = 145
# rows with backstreet boys = 105
no_pop()

Unnamed: 0,year,track_id,artist,title
0,1922,TRSGHLU128F421DF83,Alberta Hunter,Don't Pan Me
1,1922,TRMYDFV128F42511FC,Barrington Levy,Warm And Sunny Day
2,1922,TRRAHXQ128F42511FF,Barrington Levy,Looking My Love
3,1922,TRFAFTK12903CC77B8,Barrington Levy,Warm And Sunny Day
4,1922,TRSTBUY128F4251203,Barrington Levy,Mandela You're Free
...,...,...,...,...
207056,1999,TRZTTVE128F42A9922,üNN,There's Some Truth It
207057,1999,TRPJQXK128F42A993C,üNN,Blue
207058,1999,TRTYTEL128F42A9956,üNN,Arrival
207059,1999,TRUFRVS128F42A9965,üNN,As Cs Ds


In [71]:
df_nopop = no_pop()
assert_equal(isinstance(df_nopop, pd.DataFrame), True)
assert_array_equal(df_nopop.columns, ['year', 'track_id', 'artist', 'title'])
assert_equal(df_nopop.shape, (207001, 4))

## Problem 10
Create the functions `read_trips` and `write_trips`. The `read_trips` function should read `/mnt/data/public/nyctaxi/trip_data/trip_data_1.csv` and return a `pandas` `DataFrame` which contains the contents of the first 100 data rows of the CSV file. The type of `rate_code` should be `str`, and `pickup_datetime` and `dropoff_datetime` should be `datetime`. The `write_trips` function should accept the output of `read_trips` and save the columns `pickup_longitude`,	`pickup_latitude`, `dropoff_longitude`, and `dropoff_latitude` to a CSV file named `trip_coords.csv`.

In [72]:
def read_trips():
    '''reads /mnt/data/public/nyctaxi/trip_data/trip_data_1.csv and returns a 
    pandas DataFrame which contains the contents of the first 100 data rows 
    of the CSV file. The type of rate_code should be str, and pickup_datetime
    and dropoff_datetime should be datetime.
    
    Returns
    -------------
    df_trips_10: pandas DataFrame 
        DataFrame which contains the contents of the first 100 data rows 
        of the CSV file. The type of rate_code is object, and 
        pickup_datetime and dropoff_datetime are datetime.
    
    
    '''
    df_trips_10 = pd.read_csv('/mnt/data/public/nyctaxi/trip_data/trip_data_1.csv').head(100)
    df_trips_10['rate_code'] = df_trips_10['rate_code'].astype(str)
    df_trips_10['pickup_datetime'] = pd.to_datetime(df_trips_10['pickup_datetime'])
    df_trips_10['dropoff_datetime'] = pd.to_datetime(df_trips_10['pickup_datetime'])
    return df_trips_10

def write_trips(df_trips):
    '''accepts the output of read_trips and save the columns pickup_longitude,
    pickup_latitude, dropoff_longitude, and dropoff_latitude to a CSV file 
    named trip_coords.csv.
    
    Parameters
    -------------
    df_trips: pandas DataFrame
        output of read_trips function     
    '''
    df_write_trips = df_trips[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']]
    df_write_trips.to_csv('trip_coords.csv')



In [73]:
!rm -f trip_coords.csv
df_trips = read_trips()
write_trips(df_trips)
assert_equal(isinstance(df_trips, pd.DataFrame), True)
assert_array_equal(df_trips.shape, (100, 14))
assert_array_equal(
    df_trips.columns, 
    ['medallion', 'hack_license', 'vendor_id', 'rate_code', 
     'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime',
     'passenger_count', 'trip_time_in_secs', 'trip_distance', 
     'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
     'dropoff_latitude'])