In [223]:
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import pandas as pd
import os
%matplotlib inline

In [224]:
# Every input is read from the `output` folder under the current working directory.
path = os.getcwd()

# Load the three CSV tables in one pass; the unpacking order matches the
# order of the file suffixes listed below.
income, population, age_sex = (
    pd.read_csv(path + csv_suffix)
    for csv_suffix in ('/output/income.csv',
                       '/output/population.csv',
                       '/output/age_sex.csv')
)

# Plain-text file opened for reading. The handle is intentionally left open:
# the parsing cell calls `zipcode.readlines()` on it.
zipcode = open(path + '/output/zipcode.out', 'r')

In [225]:
# Number of rows kept for the analysis. The first line of the file is always
# skipped, so the rows used are file lines 2 .. TOP_N + 1.
TOP_N = 100

# `zipcode` is a file handle opened in an earlier cell. Rewind it so this cell
# can be re-run: without the seek, a second run reads nothing from the
# exhausted handle and the frame construction below fails.
zipcode.seek(0)

# Split each line on tabs once; field 0 is the zipcode, field 1 the count.
rows = [line.split('\t') for line in zipcode.readlines()]

# `complaints` keeps the count from every line (including the skipped first
# one); `zipc` holds only the integer zipcodes of the retained rows.
complaints = [int(fields[1].strip()) for fields in rows]
zipc = [int(fields[0].strip()) for fields in rows[1:TOP_N + 1]]

# The index is sized from the rows actually read, so a file with fewer than
# TOP_N + 1 lines still produces a consistent frame instead of raising.
complaint = pd.DataFrame(
    {'Zipcode': np.asarray(zipc), 'Complaints': np.asarray(complaints[1:TOP_N + 1])},
    index=np.arange(len(zipc)),
)

# Left joins keep every retained zipcode even when a table has no match for it.
f1 = pd.merge(complaint, age_sex, how='left', on=['Zipcode'])
f2 = pd.merge(f1, population, how='left', on=['Zipcode'])
f3 = pd.merge(f2, income, how='left', on=['Zipcode'])

# Force these columns to float in one vectorised step.
numeric_cols = ['Median age', 'Sex ratio (males per 100 females)', 'Mean income']
f3[numeric_cols] = f3[numeric_cols].astype(float)

In [226]:
# Show the first five rows of the merged frame as a quick check of the joins.
f3.head(n=5)

Unnamed: 0,Complaints,Zipcode,Median age,Sex ratio (males per 100 females),Population,Mean income
0,278416,11226,34.7,81.7,99026,58646.0
1,236724,10467,33.4,92.5,101134,46518.0
2,207677,10458,29.1,93.2,79974,41474.0
3,204803,11207,31.5,81.3,94657,47113.0
4,194782,10453,30.9,84.7,80081,33354.0


In [219]:
from sklearn import linear_model

# Target as an (n, 1) column array; predictors are every column of f3 other
# than the target itself and the zipcode identifier.
y = f3[['Complaints']].values
x = f3.drop(labels=['Complaints', 'Zipcode'], axis=1)

# Fit a linear regression on the full sample (no train/test split here).
regr = linear_model.LinearRegression()
regr.fit(x, y)

# In-sample diagnostics: the fitted coefficients, the mean of the squared
# residuals, and whatever `regr.score` reports for the same data.
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % np.mean(np.square(regr.predict(x) - y)))
print('Variance score: %.2f' % regr.score(x, y))

Coefficients: 
 [[ -2.96681086e+03  -7.34034185e+02   1.06010394e+00  -6.39268351e-02]]
Mean squared error: 847112781.07
Variance score: 0.52


In [221]:
import numpy as np
import statsmodels.api as sm

# Prepend an intercept column to the predictor frame built in the sklearn cell.
x = sm.add_constant(x)

# Fit OLS on plain arrays, as before, so `mod` and `res` keep array-based
# attributes for anything that reads them afterwards.
mod = sm.OLS(np.asarray(y), np.asarray(x))
res = mod.fit()

# Arrays carry no column names, so pass them explicitly: otherwise the summary
# labels the predictors generically and the coefficients cannot be matched
# back to the columns of `x`.
print(res.summary(yname='Complaints', xname=list(x.columns)))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.519
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     25.67
Date:                Thu, 27 Apr 2017   Prob (F-statistic):           1.97e-14
Time:                        17:12:39   Log-Likelihood:                -1169.8
No. Observations:                 100   AIC:                             2350.
Df Residuals:                      95   BIC:                             2363.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const        2.24e+05   4.72e+04      4.747      0.0

In [230]:
from scipy.stats import pearsonr

# Each pair is (column of f3 to correlate with 'Complaints', wording used for
# it in the printed sentence). The list order is the reporting order.
for column, label in [
    ('Median age', 'median age'),
    ('Sex ratio (males per 100 females)', 'Sex ratio (males per 100 females)'),
    ('Mean income', 'Mean income'),
    ('Population', 'Population'),
]:
    r, p = pearsonr(f3[column], f3['Complaints'])
    print('Pearson’s correlation coefficient between Number of complaints and {2} is {0} and its 2-tailed p-value is {1}'.format(r, p, label))

Pearson’s correlation coefficient between Number of complaints and median age is -0.3672753968165259 and its 2-tailed p-value is 0.00017070893526015382
(-0.24491210967014826, 0.014055035611854221)
(-0.43204457255750794, 7.1733861919714674e-06)
(0.63990707733162333, 7.6472175303692768e-13)
