In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import squareform, pdist
%matplotlib inline

  import pandas.util.testing as tm


In [2]:
def preprocessing(df, name):
    '''
    Clean one sensor export and aggregate it to hourly means.

    The caller's frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export containing a ``created_at`` column together with
        ``entry_id`` and ``field1`` ... ``field8``.
    name : str
        Column name given to the cleaned ``field1`` readings in the result.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by hourly ``created_at`` bins holding the mean reading
        of each hour rounded to 2 decimals. Hours with no valid reading are
        dropped.
    '''
    # Work on a copy so the frame passed in by the caller stays untouched.
    df = df.copy()

    # fillna returns a new Series; it has to be assigned back, otherwise the
    # gaps in field1 are never actually filled from field3.
    df['field1'] = df['field1'].fillna(df['field3'])

    df['created_at'] = pd.to_datetime(df['created_at'])
    df = df.set_index('created_at').sort_index()
    df = df.drop(columns=['entry_id', 'field2', 'field3', 'field4',
                          'field5', 'field6', 'field7', 'field8'])

    # Anything that cannot be parsed as a number becomes NaN and is therefore
    # ignored by the hourly mean below.
    df['field1'] = pd.to_numeric(df['field1'], errors='coerce')
    df = df.rename(columns={'field1': name})

    # A one-hour Timedelta gives the same bins as the 'H' alias without
    # depending on how a particular pandas version spells that alias.
    hourly_df = df.resample(pd.Timedelta(hours=1)).mean().round(2)
    return hourly_df.dropna()

In [3]:
def merge_dfs(df_list):
    '''
    Fold a sequence of dataframes into one frame.

    The frames are merged pairwise, left to right, with ``pd.merge`` on
    ``created_at``; no ``how`` is passed, so pandas' default join applies.
    A single-element sequence is returned as is.
    '''
    import functools

    def _join_on_timestamp(accumulated, incoming):
        # One merge step: attach the next frame to what has been merged so far.
        return pd.merge(accumulated, incoming, on='created_at')

    return functools.reduce(_join_on_timestamp, df_list)

In [4]:
def heat_map(df, title, filename='COD.png'):
    '''
    Draw ``df`` as a heat map, save it to disk and display it.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Values to plot; passed straight to ``seaborn.heatmap``.
    title : str
        Title placed above the plot.
    filename : str, optional
        Path of the image file to write. Defaults to ``'COD.png'``, the name
        that used to be hard-coded.
    '''
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.set_title(title)
    # `linewidths` is the parameter seaborn documents for the cell borders.
    sns.heatmap(df, cmap='OrRd', linewidths=1, ax=ax)
    # Save before showing: once plt.show() has run, the figure that was just
    # displayed is no longer the current one and savefig would write an
    # empty image.
    fig.savefig(filename)
    plt.show()

In [5]:
def compute_completeness(df, total_hours=4368):
    '''
    Percentage of expected hourly rows that are present in ``df``.

    Parameters
    ----------
    df : object with a ``shape`` attribute (e.g. pandas.DataFrame)
        One row per hour that has data.
    total_hours : int or float, optional
        Number of hours expected in the period. Defaults to 4368
        (182 days x 24 hours) -- presumably the six-month window the
        notebook analyses; confirm before reusing for another period.

    Returns
    -------
    float
        ``rows / total_hours * 100`` rounded to 2 decimals.

    Raises
    ------
    ValueError
        If ``total_hours`` is not positive.
    '''
    if total_hours <= 0:
        raise ValueError('total_hours must be a positive number')
    actual_hours = df.shape[0]
    return round(actual_hours / total_hours * 100, 2)

In [6]:
# Load the per-parish summary table; its `Parish` column drives the
# per-parish file loading further down.
completeness_csv = './data/completeness_table_above_75_10_09_2020.csv'
kampala_data = pd.read_csv(completeness_csv)
kampala_data.head()

Unnamed: 0,Parish,Latitude,Longitude,Mean PM 2.5,Median PM 2.5,25th Percentile,75th Percentile,No. of hourly measurements,6-Month Completeness (%)
0,Nakasero II(2),0.32232,32.5757,37.7,35.9,19.7,48.4,4349,99.57
1,Kansanga,0.29875,32.615,32.7,30.6,15.6,43.8,4311,98.7
2,Nansana East,0.3759,32.528,62.4,51.7,36.0,79.8,4093,93.7
3,Lubaga,0.295314,32.553682,52.1,45.6,32.1,63.8,4055,92.83
4,Nansana West,0.363,32.529,57.7,48.6,36.7,70.3,4053,92.79


In [7]:
import os

# List the data directory once instead of once per parish.
data_files = os.listdir('data/')

kampala_df_list = []
for parish in kampala_data.Parish:
    # Resolve the file afresh for every parish. Previously `filepath` kept its
    # value from the preceding iteration, so a parish without a matching file
    # silently re-loaded the previous parish's data under the wrong name.
    matches = [f for f in data_files if parish + '.csv' in f]
    if not matches:
        raise FileNotFoundError(
            "no CSV file found in 'data/' for parish " + repr(parish))
    # If several files match, keep the last one listed (unchanged behaviour).
    filepath = 'data/' + matches[-1]
    print(filepath)
    location_df = pd.read_csv(filepath)
    # Column name for the parish: lower-case with spaces replaced by '_'.
    preprocessed_df = preprocessing(location_df, parish.lower().replace(' ', '_'))
    kampala_df_list.append(preprocessed_df)
    print('DONE!!!')

data/AQ_66-Nakasero II(2).csv


  interactivity=interactivity, compiler=compiler, result=result)


DONE!!!
data/aq_46-Kansanga.csv
DONE!!!
data/aq_58-Nansana East.csv
DONE!!!
data/aq_39-Lubaga.csv
DONE!!!
data/aq_63-Nansana West.csv
DONE!!!
data/AQ_49-Lukuli(2).csv
DONE!!!
data/aq_29-Bugolobi.csv
DONE!!!
data/aq_59-Kyaliwajjala.csv
DONE!!!
data/AQ_32-Seguku.csv
DONE!!!
data/AQ_30-Kiwafu.csv
DONE!!!
data/aq_61-Kiwatule.csv
DONE!!!
data/aq_26-Civic Centre.csv
DONE!!!
data/aq_43-Makindye I.csv
DONE!!!


In [8]:
# Combine the per-parish hourly frames into one table (one column per parish)
# and preview the first rows.
kampala_merged_df = merge_dfs(kampala_df_list)
kampala_merged_df.head()

Unnamed: 0_level_0,nakasero_ii(2),kansanga,nansana_east,lubaga,nansana_west,lukuli(2),bugolobi,kyaliwajjala,seguku,kiwafu,kiwatule,civic_centre,makindye_i
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-02-14 00:00:00+00:00,49.48,54.06,91.2,48.35,81.45,47.23,60.42,71.84,54.9,67.69,52.25,44.12,50.98
2020-02-14 01:00:00+00:00,41.36,41.99,88.5,47.21,71.8,37.67,44.94,50.6,44.96,62.32,40.87,37.35,43.32
2020-02-14 02:00:00+00:00,38.66,34.01,79.04,55.33,85.97,31.02,38.04,44.08,50.87,55.89,33.82,37.13,57.33
2020-02-14 03:00:00+00:00,36.88,37.33,85.47,104.5,242.98,31.71,36.69,49.85,55.17,54.8,38.37,38.78,43.54
2020-02-14 04:00:00+00:00,34.72,37.95,252.73,88.82,103.63,35.25,32.7,54.73,47.46,56.34,82.25,43.27,39.89


In [9]:
# (rows, columns) of the merged table.
kampala_merged_df.shape

(2753, 13)

In [10]:
# Import from the public `scipy.stats` namespace; `scipy.stats.stats` is a
# private module path that SciPy has deprecated.
from scipy.stats import pearsonr
help(pearsonr)

Help on function pearsonr in module scipy.stats.stats:

pearsonr(x, y)
    Pearson correlation coefficient and p-value for testing non-correlation.
    
    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets.  The calculation of the p-value relies on the
    assumption that each dataset is normally distributed.  (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.)  Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.
    Positive correlations imply that as x increases, so does y. Negative
    correlations imply that as x increases, y decreases.
    
    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these da

In [None]:
#Sample pearson implementation
import math

def average(x):
    """Return the arithmetic mean of the non-empty sequence ``x`` as a float."""
    count = len(x)
    assert count > 0
    total = float(sum(x))
    return total / count

def pearson_def(x, y):
    """Pearson correlation coefficient of two equal-length, non-empty sequences.

    Computed as the sum of products of the deviations from the means divided
    by the square root of the product of the two sums of squared deviations.
    """
    assert len(x) == len(y)
    size = len(x)
    assert size > 0

    mean_x = average(x)
    mean_y = average(y)

    # Running sums: cross products and squared deviations of each sequence.
    cross = 0
    sq_x = 0
    sq_y = 0
    for x_val, y_val in zip(x, y):
        dx = x_val - mean_x
        dy = y_val - mean_y
        cross += dx * dy
        sq_x += dx * dx
        sq_y += dy * dy

    return cross / math.sqrt(sq_x * sq_y)

# print is a function in Python 3; the statement form is a SyntaxError there.
print(pearson_def([1, 2, 3], [1, 5, 7]))