In [286]:
import numpy as np
import pandas as pd
from numpy.testing import assert_equal, assert_allclose

## Problem 1

Create a function `double_work` that accepts a pandas data frame `df` with a time series index and columns `a` and `b`. It modifies `df` such that the value of `a` from 9am to 5pm (inclusive) of 1 Jan 2021 is double the value of `b` for those hours.

In [23]:
def double_work(df):
    """
    Set column a to twice column b for the working hours of 1 Jan 2021.

    Rows whose timestamp falls on 1 January 2021 with an hour of day from 9
    to 17 (9am to 5pm, inclusive) get ``a = 2 * b``. Every other row,
    including the same hours on any other date, is left untouched. The frame
    is modified in place.

    Parameters
    ----------
    df : DataFrame
        DataFrame with columns a and b and a DatetimeIndex

    Returns
    -------
    df : DataFrame
        the same DataFrame object that was passed in, after modification
    """
    stamps = df.index
    # Component-wise date test works for both naive and tz-aware indexes
    on_target_day = (
        (stamps.year == 2021) & (stamps.month == 1) & (stamps.day == 1)
    )
    in_work_hours = (stamps.hour >= 9) & (stamps.hour <= 17)
    keep_a = ~(on_target_day & in_work_hours)

    # Reassign the whole column so an integer `a` can be upcast to float
    # when doubled values of `b` are written into it
    df['a'] = df['a'].where(keep_a, df['b'] * 2)
    return df

In [24]:
# Hourly frame covering 1 Jan 2021 from 00:00 to 23:00
df = pd.DataFrame(
    {
        'a': range(24),
        'b': np.linspace(1, 100, 24)
    },
    index=pd.date_range('2021-01-01 00:00', '2021-01-01 23:00', freq='h')
)
double_work(df)

# The index itself must be unchanged. The expected timestamps are built
# without the `freq` keyword, which newer pandas releases no longer accept.
assert_equal(
    df.index[:10].tolist(),
    [pd.Timestamp('2021-01-01 00:00:00'),
     pd.Timestamp('2021-01-01 01:00:00'),
     pd.Timestamp('2021-01-01 02:00:00'),
     pd.Timestamp('2021-01-01 03:00:00'),
     pd.Timestamp('2021-01-01 04:00:00'),
     pd.Timestamp('2021-01-01 05:00:00'),
     pd.Timestamp('2021-01-01 06:00:00'),
     pd.Timestamp('2021-01-01 07:00:00'),
     pd.Timestamp('2021-01-01 08:00:00'),
     pd.Timestamp('2021-01-01 09:00:00')]
)
# Rows 0-8 keep their original `a`; the 09:00 row has a == 2 * b
assert_allclose(
    df.iloc[:10].values,
    [[ 0.        ,  1.        ],
     [ 1.        ,  5.30434783],
     [ 2.        ,  9.60869565],
     [ 3.        , 13.91304348],
     [ 4.        , 18.2173913 ],
     [ 5.        , 22.52173913],
     [ 6.        , 26.82608696],
     [ 7.        , 31.13043478],
     [ 8.        , 35.43478261],
     [79.47826087, 39.73913043]]
)

## Problem 2

Create a function `hourly_mem_usage` that reads `mem.csv` and returns the mean hourly memory usage in Philippine time (UTC+8) as a pandas time series. The first column of `mem.csv` is the time in UTC and the second column is the memory usage in bytes. Each interval is closed on the left but open on the right.

In [120]:
def hourly_mem_usage():
    """
    Compute mean hourly memory usage in UTC+8 time

    Read 'mem.csv' and return the mean hourly memory usage in Philippine
    time (UTC+8) as a pandas time series. Each interval is closed on the
    left but open on the right.

    The file is expected to have a 'Time' column holding UTC timestamps and
    an 'accesslab' column holding the memory usage.

    Returns
    -------
    Series
        mean of 'accesslab' per hour, indexed by the start of each hour in
        the Asia/Manila time zone
    """
    usage = pd.read_csv('mem.csv')

    # Interpret the timestamps as UTC, then express them in UTC+8
    usage['Time'] = (pd.to_datetime(usage['Time'], utc=True)
                     .dt.tz_convert('Asia/Manila'))

    hourly_bins = pd.Grouper(key='Time', freq='h', closed='left')
    # Select the column before averaging so only memory usage is aggregated
    return usage.groupby(hourly_bins)['accesslab'].mean()

In [122]:
hmu = hourly_mem_usage()
assert_equal(len(hmu), 49)
# Expected timestamps are built without the `freq` keyword, which newer
# pandas releases no longer accept.
assert_equal(
    hmu.index[:10].tolist(),
    [pd.Timestamp('2021-05-28 00:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 01:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 02:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 03:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 04:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 05:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 06:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 07:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 08:00:00+0800', tz='Asia/Manila'),
     pd.Timestamp('2021-05-28 09:00:00+0800', tz='Asia/Manila')]
)
assert_allclose(
    hmu[:10].tolist(),
    [369325498368.0,
     389165290837.3333,
     394183460454.4,
     388326653132.8,
     390690503065.6,
     391548573832.5333,
     389338755072.0,
     389219362406.4,
     390718947328.0,
     375491907037.86664]
)

## Problem 3

Create a function `daily_mem_usage` that reads `mem.csv` and returns the mean daily memory usage as a pandas time series with a daily period index. The first column of `mem.csv` is the time and the second column is the memory usage in bytes. Each interval is closed on the left but open on the right.

In [158]:
def daily_mem_usage():
    """
    Compute mean daily memory usage

    Reads mem.csv and returns the mean daily memory usage as a pandas time
    series with a daily period index. Each interval is closed on the left
    but open on the right.

    The file is expected to have a 'Time' column holding timestamps and an
    'accesslab' column holding the memory usage.

    Returns
    -------
    Series
        mean of 'accesslab' per calendar day, indexed by daily periods
        (index name 'date')
    """
    usage = pd.read_csv('mem.csv')
    usage['Time'] = pd.to_datetime(usage['Time'])

    # Daily period containing each sample
    usage['date'] = pd.PeriodIndex(usage['Time'], freq='D')

    return usage.groupby('date')['accesslab'].mean()

In [159]:
dmu = daily_mem_usage()

# Expected first two daily periods and their mean memory usage
expected_days = [pd.Period('2021-05-27', 'D'), pd.Period('2021-05-28', 'D')]
expected_means = [390154853588.5283, 346686658616.8889]

assert_equal(dmu.index[:2].tolist(), expected_days)
assert_allclose(dmu[:2].tolist(), expected_means)

## Problem 4

Create a function `longest_distances` that reads the first 1M data lines of `/mnt/data/public/nyctaxi/trip_data/trip_data_1.csv` and returns a `pandas.Series` containing the maximum `trip_distance` for each pickup hour of day and passenger count. The index is a hierarchical index of `pickup_datetime` and `passenger_count`.

In [200]:
def longest_distances():
    """
    Get maximum trip_distance values

    Reads the first 1M data lines of filepath '/mnt/data/public/nyctaxi/
    trip_data/trip_data_1.csv' and returns a pandas Series containing the
    maximum trip_distance for each pickup hour of day and passenger count.
    The index is a hierarchical index of pickup_datetime and passenger_count.

    Returns
    -------
    Series
        maximum trip_distance per (pickup hour of day, passenger_count)
    """
    # Only three columns are needed, so skip parsing the rest of the file
    trips = pd.read_csv(
        '/mnt/data/public/nyctaxi/trip_data/trip_data_1.csv',
        usecols=['pickup_datetime', 'passenger_count', 'trip_distance'],
        nrows=1000000,
    )
    # The hour Series keeps the name 'pickup_datetime', which becomes the
    # name of the first index level
    pickup_hour = pd.to_datetime(trips['pickup_datetime']).dt.hour
    return (trips.groupby([pickup_hour, trips['passenger_count']])
            ['trip_distance'].max())

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,1.0,-73.978165,40.757977,-73.989838,40.751171
1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,1.5,-74.006683,40.731781,-73.994499,40.75066
2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,1.1,-74.004707,40.73777,-74.009834,40.726002
3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,0.7,-73.974602,40.759945,-73.984734,40.759388
4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,2.1,-73.97625,40.748528,-74.002586,40.747868


In [201]:
distances = longest_distances()

# Result type and index layout
assert_equal(isinstance(distances, pd.core.series.Series), True)
expected_level_names = ['pickup_datetime', 'passenger_count']
assert_equal(distances.index.names, expected_level_names)

# Index level sizes and total number of entries
assert_equal(distances.index.levshape, (24, 7))
assert_equal(distances.shape, (149,))

## Problem 5
Create a function `mean_ratings` that reads `/mnt/data/public/insideairbnb/data.insideairbnb.com/united-kingdom/england/london/2015-04-06/data/listings.csv.gz` and returns a `pandas.Series` containing the average `review_scores_rating` of hosts, binned by monthly `host_since`. Ignore `NA`s or null values when computing the mean.

In [213]:
import datetime

In [247]:
def mean_ratings():
    """
    Compute average review_scores_rating values

    Reads /mnt/data/public/insideairbnb/data.insideairbnb.com/united-kingdom/
    england/london/2015-04-06/data/listings.csv.gz and returns a pandas.Series
    containing the average review_scores_rating of hosts, binned by monthly
    host_since. Missing ratings are skipped when computing each mean.

    Returns
    -------
    Series
        mean review_scores_rating per month of host_since, indexed by
        month-end timestamps
    """
    # Only the two columns used below are loaded
    listings = pd.read_csv(
        '/mnt/data/public/insideairbnb/data.insideairbnb.com/united'
        '-kingdom/england/london/2015-04-06/data/listings.csv.gz',
        usecols=['host_since', 'review_scores_rating'],
    )
    listings['host_since'] = pd.to_datetime(listings['host_since'])

    # An explicit month-end offset is used in place of a string alias
    # because the alias spelling differs between pandas releases
    monthly_bins = pd.Grouper(key='host_since', freq=pd.offsets.MonthEnd())
    return listings.groupby(monthly_bins)['review_scores_rating'].mean()

In [248]:
import datetime

ratings = mean_ratings()

# Result type and number of monthly bins
assert_equal(isinstance(ratings, pd.core.series.Series), True)
assert_equal(ratings.shape, (80,))

# First and last monthly bin labels
first_bin, last_bin = ratings.index[0], ratings.index[-1]
assert_equal(first_bin, datetime.datetime(2008, 9, 30))
assert_equal(last_bin, datetime.datetime(2015, 4, 30))

## Problem 6

Create a function `product_aisles` that reads `/mnt/data/public/instacart/instacart_2017_05_01/products.csv` and returns a `pandas.DataFrame`. The data frame should have the contents of that file but with an additional column `aisle` that corresponds to the name of the aisle according to `/mnt/data/public/instacart/instacart_2017_05_01/aisles.csv`. The index should be the `product_id` column.

In [272]:
def product_aisles():
    """
    Attach aisle names to the Instacart product list

    Reads /mnt/data/public/instacart/instacart_2017_05_01/products.csv and
    adds a column `aisle` holding the aisle name looked up by `aisle_id` in
    /mnt/data/public/instacart/instacart_2017_05_01/aisles.csv. Products
    whose `aisle_id` has no match get a null `aisle`.

    Returns
    -------
    DataFrame
        contents of products.csv plus the `aisle` column, indexed by
        `product_id`
    """
    products = pd.read_csv('/mnt/data/public/instacart/instacart_2017_05_01'
                           '/products.csv')
    aisles = pd.read_csv('/mnt/data/public/instacart/'
                         'instacart_2017_05_01/aisles.csv')

    # A column-keyed merge discards the left frame's index, so product_id
    # is made the index only after the join
    return (products.merge(aisles, how='left', on='aisle_id')
            .set_index('product_id'))

In [273]:
df_products_aisles = product_aisles()

# Result type, column layout and size
expected_columns = ['product_name', 'aisle_id', 'department_id', 'aisle']
assert_equal(isinstance(df_products_aisles, pd.DataFrame), True)
assert_equal(df_products_aisles.columns.tolist(), expected_columns)
assert_equal(df_products_aisles.shape, (49688, 4))

## Problem 7

Create a function `tracks_with_loc` that reads `/mnt/data/public/millionsong/AdditionalFiles/unique_tracks.txt` and returns a `pandas` `DataFrame` that contains the contents of the file but with the artist location from `/mnt/data/public/millionsong/AdditionalFiles/artist_location.txt` matched based on the artist name. Leave the value as null if the artist is not found in `artist_location.txt`. Use the first entry in the file if an artist has more than one location. Sort the data frame by increasing track ID.

In [323]:
def tracks_with_loc():
    """
    Left-join the unique track list with artist locations.

    Reads /mnt/data/public/millionsong/AdditionalFiles/unique_tracks.txt and
    /mnt/data/public/millionsong/AdditionalFiles/artist_location.txt, both
    delimited by the literal text <SEP>, and matches each track to a location
    by artist name. Tracks whose artist is absent from artist_location.txt
    keep null location fields. When an artist appears more than once in
    artist_location.txt only the first entry is used. The result is sorted
    by increasing track ID.

    Returns
    -------
    DataFrame
        every track with columns track_id, song_id, artist, title,
        artist_id, lat, lon, location
    """
    folder = '/mnt/data/public/millionsong/AdditionalFiles/'

    tracks = pd.read_csv(
        folder + 'unique_tracks.txt',
        sep='<SEP>',
        engine='python',
        names=['track_id', 'song_id', 'artist', 'title'],
    )
    locations = pd.read_csv(
        folder + 'artist_location.txt',
        sep='<SEP>',
        engine='python',
        names=['artist_id', 'lat', 'lon', 'artist', 'location'],
    )

    # Keep one location row per artist name: the first one in file order
    first_locations = locations.drop_duplicates(subset='artist', keep='first')

    joined = tracks.merge(first_locations, on='artist', how='left')
    return joined.sort_values(by='track_id')

In [324]:
df_tracks_loc = tracks_with_loc()

# Result type, size and column order
expected_columns = [
    'track_id', 'song_id', 'artist', 'title',
    'artist_id', 'lat', 'lon', 'location',
]
assert_equal(isinstance(df_tracks_loc, pd.DataFrame), True)
assert_equal(df_tracks_loc.shape, (1000000, 8))
assert_equal(df_tracks_loc.columns.tolist(), expected_columns)

## Problem 8
Create a function `party_votes` that reads `/mnt/data/public/elections/comelec/congress_results/congressional_results_2013.csv` and returns a `pandas` `DataFrame` where the index is the `province_or_city`, the columns are the `party_affiliation` and the values are the total number of votes as integer.

In [360]:
def party_votes():
    """
    Total votes per province/city and party

    Reads /mnt/data/public/elections/comelec/congress_results/congressional_
    results_2013.csv and returns a pandas DataFrame where the index is the
    province_or_city, the columns are the party_affiliation and the values are
    the total number of votes as integer. Province/party combinations that do
    not occur in the file are reported as 0.

    Returns
    -------
    DataFrame
        summed votes_obtained, one row per province_or_city and one column
        per party_affiliation, dtype int64
    """
    results = pd.read_csv('/mnt/data/public/elections/comelec/'
                          'congress_results/congressional_results_2013.csv')

    # aggfunc must be given explicitly: pivot_table averages by default,
    # which under-reports parties with several candidates in one province
    totals = pd.pivot_table(
        results,
        values='votes_obtained',
        index='province_or_city',
        columns='party_affiliation',
        aggfunc='sum',
        fill_value=0,
    )
    return totals.astype('int64')

Unnamed: 0,province_or_city,position,name,nickname,party_affiliation,votes_obtained
0,Abra,Representative-Lone District,"Bernos, Maria Jocelyn Valera",Joy,Liberal Party,78447
1,Agusan del Norte,Representative-1st District,"Fortun, Lawrence Lemuel Hernandez",Law,Liberal Party,94483
2,Agusan del Norte,Representative-2nd District,"Amante, ErLiberal Partye John Malbas",Ping,KUSGAN,102240
3,Agusan del Sur,Representative-1st District,"Plaza, Maria Valentina Galido",Tina,National Unity Party,74537
4,Agusan del Sur,Representative-2nd District,"Mellana, Evelyn Plaza",Bebs,National Unity Party,64027


In [361]:
df_votes = party_votes()

# Result type, size and dtype
assert_equal(isinstance(df_votes, pd.DataFrame), True)
assert_equal(df_votes.shape, (108, 24))
assert_equal((df_votes.dtypes == int).all(), True)

# First row label, first column label and the value where they meet
first_province, first_party = df_votes.index[0], df_votes.columns[0]
assert_equal(first_province, 'Abra')
assert_equal(first_party, 'BPP')
assert_equal(df_votes.iloc[0,0], 0)

## Problem 9
Create a function `naia_traffic` that reads `/mnt/data/public/opendata/transport/caap-aircraft/airdata_aircraft_movement_2016.csv` and returns the number of passengers per month, as integer, for every `airline_operator` in `NAIA` as a `pandas` `DataFrame`. The months should be in title case.

In [398]:
def naia_traffic():
    """
    Monthly figures per airline operator at NAIA, in long format

    Reads /mnt/data/public/opendata/transport/caap-aircraft/
    airdata_aircraft_movement_2016.csv, keeps the rows whose airport is
    NAIA and reshapes the month columns into one row per airline_operator
    and month. Month names are title-cased and the values are cast to
    integer.

    Returns
    -------
    DataFrame
        columns airline_operator, month and passengers
    """
    movements = pd.read_csv('/mnt/data/public/opendata/transport/'
                            'caap-aircraft/'
                            'airdata_aircraft_movement_2016.csv')
    naia_rows = movements.loc[movements['airport'] == 'NAIA']

    # Month columns are taken by position: everything after the first three
    # columns and before the last two
    month_columns = naia_rows.columns[3:-2]

    long_form = naia_rows.melt(
        id_vars=['airline_operator'],
        value_vars=month_columns,
        var_name='month',
        value_name='passengers',
    )
    return long_form.assign(
        month=lambda frame: frame['month'].str.title(),
        passengers=lambda frame: frame['passengers'].astype('int64'),
    )


In [399]:
df_naia = naia_traffic()

# Result type, column layout and size
assert_equal(isinstance(df_naia, pd.DataFrame), True)
assert_equal(
    df_naia.columns.tolist(),
    ['airline_operator', 'month', 'passengers'])
assert_equal(df_naia.shape, (36, 3))

# Spot-check the April rows
df_naia_list = df_naia.to_numpy().tolist()
expected_april_rows = [
    ['Domestic', 'April', 13517],
    ['G. Aviation', 'April', 3586],
    ['International', 'April', 8587],
]
for expected_row in expected_april_rows:
    assert_equal(expected_row in df_naia_list, True)

## Problem 10

Create a function `pudo` that reads the first 1M data lines of `/mnt/data/public/nyctaxi/all/yellow_tripdata_2017-12.csv` and returns a `pandas` `DataFrame` where the index is the unique values of `PULocationID`, the columns are the unique values of `DOLocationID`, and the values are the number of times the `PULocationID`-`DOLocationID` pair occurred.

In [432]:
def pudo():
    """
    Count trips for every pickup/drop-off location pair

    Reads the first 1M data lines of /mnt/data/public/nyctaxi/all/
    yellow_tripdata_2017-12.csv and returns a pandas DataFrame where the index
    is the unique values of PULocationID, the columns are the unique values of
    DOLocationID, and the values are the number of times the
    PULocationID-DOLocationID pair occurred. Pairs that never occur are 0.

    Returns
    -------
    DataFrame
        pair counts with PULocationID as index and DOLocationID as columns
    """
    # Only the two location columns are needed to count pairs
    trips = pd.read_csv(
        '/mnt/data/public/nyctaxi/all/yellow_tripdata_2017-12.csv',
        usecols=['PULocationID', 'DOLocationID'],
        nrows=1000000,
    )
    # size() counts rows per pair without relying on any other column;
    # unstack spreads the drop-off IDs across the columns
    pair_counts = trips.groupby(['PULocationID', 'DOLocationID']).size()
    return pair_counts.unstack(fill_value=0)

In [433]:
df_pudo = pudo()

# Size of the table and its first pickup location IDs
assert_equal(df_pudo.shape, (243, 261))
assert_equal(df_pudo.index[:10].tolist(), [1, 3, 4, 5, 6, 7, 8, 9, 10, 11])

# Positions (row, column) of the first ten non-zero counts
nonzeros = df_pudo.to_numpy().nonzero()
nonzero_rows, nonzero_cols = nonzeros[0], nonzeros[1]
assert_equal(nonzero_rows[:10], [0, 0, 0, 1, 1, 1, 1, 1, 1, 2])
assert_equal(nonzero_cols[:10], [0, 128, 260, 2, 31, 68, 133, 155, 164, 0])

# First ten counts of the first pickup location
first_row_head = df_pudo.iloc[0][:10].to_numpy()
assert_equal(first_row_head, [86,  0,  0,  0,  0,  0,  0,  0,  0,  0])