In [None]:
import pandas as pd
import numpy as np
import statsmodels as sm
import sklearn as skl
import sklearn.preprocessing as preprocessing
import sklearn.linear_model as linear_model
import sklearn.cross_validation as cross_validation
import sklearn.metrics as metrics
import sklearn.tree as tree
import seaborn as sns
import matplotlib.pylab as plt
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean
import plotly.plotly as py

In [None]:
data = pd.read_csv("/home/harshil/BDAP/Python Code/adultdata.csv", names = ["Age", "Workclass", "fnlwgt", 
        "Education", "Education-Num", "Martial_Status","Occupation", "Relationship", "Race", "Sex",
        "Capital_Gain", "Capital_Loss","Hour", "Country", "Target"],
        sep=r'\s*,\s*',engine='python',na_values="?")

In [None]:
# ----------5 number summary----------------------
data.describe()

In [None]:
# ----------MEASURES OF VARIABILITY---------------
print data.std()
print data.quantile([.25,.5,.75])
print data.var()

In [None]:
#-----------QQ PLOTS------------------------------
import scipy.stats as stats
import pylab
stats.probplot(data["Age"], dist="norm", plot=pylab)
pylab.subplot(321)
stats.probplot(data["fnlwgt"], dist="norm", plot=pylab)
pylab.subplot(322)
stats.probplot(data["Education-Num"], dist="norm", plot=pylab)
pylab.subplot(323)
stats.probplot(data["Capital_Gain"], dist="norm", plot=pylab)
pylab.subplot(324)
stats.probplot(data["Capital_Loss"], dist="norm", plot=pylab)
pylab.subplot(325)
stats.probplot(data["Hour"], dist="norm", plot=pylab)
pylab.subplot(326)

pylab.show()

In [None]:
# -----------BOX PLOT-----------------------------
# import plotly.graph_objs as go
import math as m 
fig = plt.figure(figsize=(20,15))
cols = 3
rows = m.ceil(float(data.shape[1]) / cols)
j = 0 
for i, column in enumerate(data.columns):
    if data.dtypes[column] != np.object:
        j += 1
        ax = fig.add_subplot(2, cols, j)
        ax.set_title(column)
        plt.boxplot(data[column])

plt.subplots_adjust(hspace=0.7, wspace=0.2)
plt.show()

In [None]:
#------------HISTOGRAM-----------------------------
# Analyse Data

import math as m 
fig = plt.figure(figsize=(20,15))
cols = 5
rows = m.ceil(float(data.shape[1]) / cols)
for i, column in enumerate(data.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    if data.dtypes[column] == np.object:
        data[column].value_counts().plot(kind="bar", axes=ax)
    else:
        data[column].hist(axes=ax)
        plt.xticks(rotation="vertical")
plt.subplots_adjust(hspace=0.7, wspace=0.2)
plt.show()


In [None]:
(data["Country"].value_counts() / data.shape[0]).head()

## OBSERVATIONS
1. Most of the data is concentrated around US

### Need to   Correlation

1. Need to convert categorical variable into numerical variables. Using Label Encoder 
2. Remove correlated variables 

In [None]:
from sklearn import preprocessing
# Encode the categorical features as numbers
def number_encode_features(df):
    result = df.copy()
    encoders = {}
#     df.apply(preprocessing.LabelEncoder().fit_transform)
    for column in result.columns:
        if result.dtypes[column] == np.object:
            encoders[column] = preprocessing.LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column])
    return result, encoders
    

# Calculate the correlation and plot it
encoded_data, _ = number_encode_features(data)
sns.heatmap(encoded_data.corr(), square=True)
plt.show()
# Label encoder function in scikit-learn package is used  

In [None]:
data[["Education", "Education-Num"]].head(15)

- We Remove Education column as Education and Education-Num are highly correlated.
- Now to model the data, we encode the categorical variable and create histograms

In [None]:
encoded_data, encoders = number_encode_features(data)
fig = plt.figure(figsize=(20,15))
cols = 5
rows = m.ceil(float(encoded_data.shape[1]) / cols)
for i, column in enumerate(encoded_data.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    encoded_data[column].hist(axes=ax)
    plt.xticks(rotation="vertical")
plt.subplots_adjust(hspace=0.7, wspace=0.2)
plt.show()

Before modelling, we divide the data into train and test sets. We scale all the data with mean 0 and variance 1 using StandardScalar in scikit-learn

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(encoded_data[encoded_data.columns - ["Target"]], encoded_data["Target"], train_size=0.70)
scaler = preprocessing.StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float64')), columns=X_train.columns)
X_test = scaler.transform(X_test.astype("float64"))
# ?X_train.astype

## Logistic Regression

In [None]:
cls = linear_model.LogisticRegression()

cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12,12))
plt.subplot(2,1,1)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=encoders["Target"].classes_, yticklabels=encoders["Target"].classes_)
plt.ylabel("Real value")
plt.xlabel("Predicted value")
print "F1 score: %f" % skl.metrics.f1_score(y_test, y_pred)
coefs = pd.Series(cls.coef_[0], index=X_train.columns)
coefs.sort()
plt.subplot(2,1,2)
coefs.plot(kind="bar")
plt.show()

Using Label encoding, the marital status values are ranging from 0 to 6 and the order is important. In practice there
is no particular order in that feature. We can fix the issue using binary features by inrtoducing dummy variables.

In [None]:
binary_data = pd.get_dummies(data)
# Let's fix the Target as it will be converted to dummy vars too
binary_data["Target"] = binary_data["Target_>50K"]
del binary_data["Target_<=50K"]
del binary_data["Target_>50K"]
plt.subplots(figsize=(20,20))
sns.heatmap(binary_data.corr(), square=True)
plt.show()

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(binary_data[binary_data.columns - ["Target"]], binary_data["Target"], train_size=0.70)
scaler = preprocessing.StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = scaler.transform(X_test)

### Logistic regression with dummy variables

In [None]:
cls = linear_model.LogisticRegression()

cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)

plt.figure(figsize=(20,20))
plt.subplot(2,1,1)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=encoders["Target"].classes_, yticklabels=encoders["Target"].classes_)
plt.ylabel("Real value")
plt.xlabel("Predicted value")
print "F1 score: %f" % skl.metrics.f1_score(y_test, y_pred)
coefs = pd.Series(cls.coef_[0], index=X_train.columns)
coefs.sort_values(inplace=True)
ax = plt.subplot(2,1,2)
coefs.plot(kind="bar")
plt.show()

# Random Forest

In [61]:
# import libraries: dataframe manipulation, machine learning, os tools
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
 # Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier

# change working directory to where the dataset is
# os.chdir("C:/Users/JD87417/Desktop/python work/Coursera")

# Load the dataset (http://archive.ics.uci.edu/ml/datasets/Adult)
AH_data = pd.read_csv("/media/harshil/Harshil/Ubuntu/prepdata.csv")
# AH_data = AH_data.drop('(Intercept)',axis=1)
data_clean = AH_data.dropna()


# encode categorical features
# done in R (C:\Users\JD87417\Desktop\python work\Coursera\python_adult2_clean.R)

# summary statistics including counts, mean, stdev, quartiles
data_clean.head(n=1)
# data_clean.dtypes # data types of each variable
# data_clean.describe()


Unnamed: 0,age,workclassFederal-gov,workclassLocal-gov,workclassNever-worked,workclassPrivate,workclassSelf-emp-inc,workclassSelf-emp-not-inc,workclassState-gov,workclassWithout-pay,fnlwgt,...,native_countryPuerto-Rico,native_countryScotland,native_countrySouth,native_countryTaiwan,native_countryThailand,income_target_50k,native_countryUnited-States,native_countryVietnam,native_countryYugoslavia,income_target>50K
0,39,0,0,0,0,0,0,1,0,77516,...,0,0,0,0,0,0,1,0,0,0


In [None]:
data_clean["Education_num"].head(1)

In [62]:

# Split into training and testing sets
# Specifying predictor x variables
predictors = data_clean[["age", "workclassLocal-gov", "workclassPrivate",
"workclassSelf-emp-inc", "workclassSelf-emp-not-inc", "workclassState-gov",
"workclassWithout-pay", "fnlwgt", "education11th", "education12th",
"education1st-4th", "education5th-6th", "education7th-8th", "education9th",
"educationAssoc-acdm", "educationAssoc-voc", "educationBachelors",
"educationDoctorate", "educationHS-grad", "educationMasters",
"educationPreschool", "educationProf-school", "educationSome-college",
"Education_num", "martial_statusMarried-AF-spouse", "martial_statusMarried-civ-spouse",
"martial_statusMarried-spouse-absent", "martial_statusNever-married",
"martial_statusSeparated", "martial_statusWidowed", "occupationArmed-Forces",
"occupationCraft-repair", "occupationExec-managerial", "occupationFarming-fishing",
"occupationHandlers-cleaners", "occupationMachine-op-inspct",
"occupationOther-service", "occupationPriv-house-serv", "occupationProf-specialty",
"occupationProtective-serv", "occupationSales", "occupationTech-support",
"occupationTransport-moving", "relationshipNot-in-family", "relationshipOther-relative",
"relationshipOwn-child", "relationshipUnmarried", "relationshipWife",
"raceAsian-Pac-Islander", "raceBlack", "raceOther", "raceWhite",
"sexMale", "capital_gain", "capital_loss", "hours_per_week",
"native_countryCanada", "native_countryChina", "native_countryColumbia",
"native_countryCuba", "native_countryDominican-Republic", "native_countryEcuador",
"native_countryEl-Salvador", "native_countryEngland", "native_countryFrance",
"native_countryGermany", "native_countryGreece", "native_countryGuatemala",
"native_countryHaiti", "native_countryHoland-Netherlands", "native_countryHonduras",
"native_countryHong", "native_countryHungary", "native_countryIndia",
"native_countryIran", "native_countryIreland", "native_countryItaly",
"native_countryJamaica", "native_countryJapan", "native_countryLaos",
"native_countryMexico", "native_countryNicaragua", "native_countryOutlying-US(Guam-USVI-etc)",
"native_countryPeru", "native_countryPhilippines", "native_countryPoland",
"native_countryPortugal", "native_countryPuerto-Rico", "native_countryScotland",
"native_countrySouth", "native_countryTaiwan", "native_countryThailand",
"native_countryUnited-States",
"native_countryVietnam", "native_countryYugoslavia"]]


In [63]:
# y repsonse variable
targets = data_clean.income_target_50k

# concurrent split of x's, y, at 40%
pred_train, pred_test, tar_train, tar_test  = train_test_split(predictors, targets, test_size=.4)

# shape/dimensions of the DataFrame
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape

(13025,)

In [64]:
# Build model on training data
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the amount of trees to build
classifier=RandomForestClassifier(n_estimators=25)
# fit the RandomForest Model
classifier=classifier.fit(pred_train,tar_train)
# prediction scoring of the model (array of binary 0-1)
predictions=classifier.predict(pred_test)
# confusion matrix / missclassification matrix
sklearn.metrics.confusion_matrix(tar_test,predictions)
sklearn.metrics.accuracy_score(tar_test, predictions)


# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train,tar_train)
# display the relative importance of each attribute
print(model.feature_importances_)



[  8.75989687e-02   8.64956358e-03   6.89798080e-03   5.55811611e-04
   6.12607333e-04   1.47218203e-04   0.00000000e+00   6.41550562e-02
   3.10183300e-05   0.00000000e+00   8.34551969e-06   9.97222495e-03
   3.30840245e-04   1.20753296e-02   3.04431495e-02   7.46605375e-03
   3.64339708e-03   0.00000000e+00   1.67994217e-02   3.39920872e-04
   0.00000000e+00   9.72638858e-06   5.99572083e-03   2.39390314e-02
   0.00000000e+00   2.31511974e-02   5.18524459e-04   1.39254370e-02
   1.18212882e-02   1.44636166e-03   0.00000000e+00   4.74319816e-02
   6.51031017e-03   7.67389438e-04   1.09740283e-04   4.58004037e-02
   2.56867729e-02   5.56936594e-03   8.88077816e-03   1.33456253e-05
   1.24339121e-02   2.93306546e-02   7.96672405e-04   1.26353409e-02
   3.98140345e-03   1.28696309e-03   2.47594065e-03   1.16982362e-02
   7.03484468e-03   4.94392910e-02   5.41749525e-02   2.19499318e-02
   8.81922202e-03   2.95545572e-04   7.09546775e-03   1.23953494e-01
   0.00000000e+00   4.52278076e-05

In [65]:
print(max(model.feature_importances_))
max_val = np.where(model.feature_importances_ == max(model.feature_importances_))

min_val = np.where(model.feature_importances_ == min(model.feature_importances_))



0.12395349441


In [66]:
print(max_val, min_val)

((array([55]),), (array([ 6,  9, 17, 20, 24, 30, 56, 59, 62, 66, 67, 69, 70, 71, 72, 74, 75,
       79, 81, 82, 83, 88, 91, 94]),))


In [67]:
"""
Running a different number of trees and see the effect
 of that on the accuracy of the prediction
"""

trees=range(25)
accuracy=np.zeros(25)

for idx in range(len(trees)):
    classifier=RandomForestClassifier(n_estimators=idx + 1)
    classifier=classifier.fit(pred_train,tar_train)
    predictions=classifier.predict(pred_test)
    accuracy[idx]=sklearn.metrics.accuracy_score(tar_test, predictions)

plt.cla()
plt.plot(trees, accuracy)
plt.show()