# Loading Modules

In [1]:
from matplotlib import interactive
import seaborn as sns
import numpy as np
import pandas as pd
import csv
import datetime
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_recall_curve
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from patsy import dmatrices
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.grid_search import GridSearchCV
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

In [2]:
def prob_adjusted_outcome(model_probabilities,threshold):
    y_adjusted=[]
    for prob in model_probabilities:
        if prob[1] > threshold:
            y_adjusted.append(True)
        else:
            y_adjusted.append(False)
    return y_adjusted

# Loading Dataset and Reordering Data

In [3]:
filename='ATA_data.csv'

df = pd.read_csv(filename)
df=df.drop('Unnamed: 0',1)

df.columns.unique()

cats_to_keep=['entertainment', 'games', 'lifestyle', 'music', 'photography', 'productivity', 
             'social_networking', 'sports', 'travel', 'utilities']

df=df[df.app_type.isin(cats_to_keep)]

#### Taking interesting features

In [4]:
cols_to_keep=['installed','clicked','impression','app_type','weekday','hours','model','country','publisher_name','language','app_name']
filtered_df=df[cols_to_keep]

### Downsampling, remove the majority class by user set percentage

In [5]:
target_downsampling = 0.50;

False_df=filtered_df[filtered_df['installed'] == False]
Truth_df=filtered_df[filtered_df['installed'] == True]

downsampling=int(np.round((Truth_df.shape[0]/target_downsampling)*(1-target_downsampling)))

#Downsampling by target downsample rate
Downsampled_False=resample(False_df, n_samples=downsampling, random_state=0, replace = False)

# Merge the downsampled False with total True outcomes
less_imbalance=pd.concat([Downsampled_False,Truth_df])

In [6]:
#No downsampling dataset
clean_data=filtered_df

## Creating the dummy variables.

### Quick dirty comparison of AUC of ROC 

<p> Comparing the ROC_AUC of the complete dataset vs. downsampled dataset. This is used as an indication of model stability. </p>

<p> 

Use patsy to create the dummy variable

In [7]:
y_com,X_com = dmatrices('installed ~ hours + C(weekday) + C(app_type) + C(model) + C(country) + C(publisher_name) + C(language) + C(app_name)'
               ,less_imbalance, return_type='dataframe'
               )

In [8]:
y_com=np.ravel(y_com.iloc[:,1:])

In [9]:
y_com

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [10]:
roc_auc_com= cross_val_score(LogisticRegression(class_weight='auto', fit_intercept=False),
                          X_com, y_com, scoring='roc_auc', cv =5)

In [11]:
roc_auc_com

array([ 0.62480749,  0.72387839,  0.74818688,  0.77814556,  0.70591584])

In [12]:
y,X = dmatrices('installed ~ hours + C(weekday) + C(app_type)'
               ,less_imbalance, return_type='dataframe'
               )

In [13]:
y=np.ravel(y.iloc[:,1:])

In [14]:
roc_auc_dsampled= cross_val_score(LogisticRegression(class_weight='auto', fit_intercept=False),
                          X, y, scoring='roc_auc', cv =5)

In [15]:
roc_auc_diff= np.mean(roc_auc_com - roc_auc_dsampled)
print roc_auc_diff

0.156857732562


## Running different models.

In [None]:
rm=[LogisticRegression(fit_intercept=False), RandomForestClassifier(), LinearSVC(), GradientBoostingClassifier(max_depth=3)]
cvs=5
model_aucs=np.empty((len(rm),cvs))
model_recalls=np.empty((len(rm),cvs))
model_precision=np.empty((len(rm),cvs))

for i,models in enumerate(rm):
    
    model_recalls[i,:] = cross_val_score(models, X, y, scoring = 'recall', cv = cvs)
    model_precision[i,:] = cross_val_score(models, X, y, scoring = 'precision', cv = cvs)
    model_aucs[i,:] = cross_val_score(models, X, y, scoring = 'roc_auc', cv = cvs)


In [None]:
model_precision

In [None]:
summary_matrices = np.empty((6,len(rm)))
summary_matrices[0, :] = np.mean(model_recalls, axis=1)
summary_matrices[1, :] = np.std(model_recalls, axis = 1)
summary_matrices[2, :] = np.mean(model_precision, axis=1)
summary_matrices[3, :] = np.std(model_precision, axis = 1)
summary_matrices[4, :] = np.mean(model_aucs, axis=1)
summary_matrices[5, :] = np.std(model_aucs, axis = 1)
print summary_matrices

In [None]:
summary_df=pd.DataFrame(summary_matrices)
summary_df.columns=['Logistic Regression', 'Random Forest', 'Linear SVC', 'GradientBoostClassifier']
summary_df.index=['Recall mean', 'Recall std', 'Precision mean', 'Precision std', 'ROC_AUC mean', 'ROC_AUC std']

In [None]:
summary_df

#### Using 90% of data to train.

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_com, y_com, test_size=0.1, random_state=0)

## Logistic Regression 

In [17]:
modelLogR2 = LogisticRegression(class_weight='auto',fit_intercept=False);
modelLogR2.fit(X_train, y_train);

In [18]:
pd.DataFrame(zip(X_train.columns, np.transpose(modelLogR2.coef_)))

Unnamed: 0,0,1
0,Intercept,[-1.01968882484]
1,C(weekday)[T.2],[0.0236734393796]
2,C(weekday)[T.3],[-0.221296974624]
3,C(weekday)[T.4],[-0.186843372066]
4,C(weekday)[T.5],[-0.0227782555788]
5,C(weekday)[T.6],[-0.0430284557223]
6,C(weekday)[T.7],[0.0403258491747]
7,C(app_type)[T.games],[0.544265196089]
8,C(app_type)[T.lifestyle],[-0.208480088555]
9,C(app_type)[T.music],[-0.225244779406]


In [19]:
#Use Training Set to verify if model is learning anything
verify_train = modelLogR2.predict(X_train)
prob_train = modelLogR2.predict_proba(X_train)
recall_score(y_train,verify_train)




0.84992784992784998

In [20]:
#metrics for recall and confusion matrix

print confusion_matrix(y_train, verify_train, labels=[True, False]).transpose()
print classification_report(y_train, verify_train)

[[589 216]
 [104 478]]
             precision    recall  f1-score   support

        0.0       0.82      0.69      0.75       694
        1.0       0.73      0.85      0.79       693

avg / total       0.78      0.77      0.77      1387



In [21]:
#Prediction using test set
predictedLR2 = modelLogR2.predict(X_test)
probLR2 = modelLogR2.predict_proba(X_test)
recall_score(y_test,predictedLR2)




0.83333333333333337

In [22]:
#metrics for recall and confusion matrix

cm = confusion_matrix(y_test, predictedLR2, labels=[True, False]).transpose()
print cm
print classification_report(y_test, predictedLR2)

[[65 36]
 [13 41]]
             precision    recall  f1-score   support

        0.0       0.76      0.53      0.63        77
        1.0       0.64      0.83      0.73        78

avg / total       0.70      0.68      0.68       155



In [None]:
y_adjusted=prob_adjusted_outcome(probLR2,0.53)

In [None]:
cm_adj = confusion_matrix(y_test, y_adjusted, labels=[True, False]).transpose()
print cm_adj
print classification_report(y_test, y_adjusted)

In [None]:
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label');
plt.xlabel('Predicted label');

In [None]:
filename='ATA2_data.csv'

df2 = pd.read_csv(filename)
df2=df2.drop('Unnamed: 0',1)

df2.columns.unique()

cats_to_keep=['entertainment', 'games', 'lifestyle', 'music', 'photography', 'productivity', 
             'social_networking', 'sports', 'travel', 'utilities']

df2=df2[df2.app_type.isin(cats_to_keep)]

In [None]:
cols_to_keep=['installed','clicked','impression','app_type','weekday','hours', 'week']
filtered_df2=df2[cols_to_keep]

In [None]:
np.mean(filtered_df2['installed'])

In [None]:
False_df=df2[filtered_df['installed'] == False]
Truth_df=df2[filtered_df['installed'] == True]

target_downsampling = 0.50;
downsampling=int(np.round((Truth_df.shape[0]/target_downsampling)*(1-target_downsampling)))



#Downsampling by target downsample rate
df2_false=resample(False_df, n_samples=downsampling, random_state=0, replace = False)
df2_truth=
# Merge the downsampled False with total True outcomes
less_imbalance=pd.concat([df2_false,Truth_df])

In [None]:
y_val,X_val = dmatrices('installed ~ hours + C(weekday) + C(app_type)'
               ,df2, return_type='dataframe'
               )

In [None]:
y_val=np.ravel(y_val.iloc[:,1:])

In [None]:
#Validate using validation set
predictedVal = modelLogR2.predict(X_val)
probVal = modelLogR2.predict_proba(X_val)
recall_score(y_val,predictedVal)




In [None]:
#metrics for recall and confusion matrix

cm = confusion_matrix(y_val, predictedVal, labels=[True, False]).transpose()
print cm
print classification_report(y_val, predictedVal)

In [None]:
y_adjusted=prob_adjusted_outcome(probVal,0.52)

In [None]:
cm_adj = confusion_matrix(y_val, y_adjusted, labels=[True, False]).transpose()
print cm_adj
print classification_report(y_val, y_adjusted)

In [None]:
y_probVal=prob_class_outcome(probVal,1)

In [None]:
plt.hist(y_probVal)

In [None]:
Validation_prob=pd.DataFrame(df2['installed'])

In [None]:
Validation_prob['week']=df2['week']

In [None]:
Validation_prob['Prob']=y_probVal

In [None]:
Validation_prob.shape

In [None]:
prob_threshold=0.4

In [None]:
lessThan20Per=Validation_prob[Validation_prob['Prob'] > prob_threshold ]

In [None]:
lessThan20Per.shape

In [None]:
drop_index=Validation_prob[np.logical_and(Validation_prob['Prob'] > prob_threshold , Validation_prob['installed'])]

In [None]:
drop_index.shape

In [None]:
Validation_prob.groupby('week').sum()

In [None]:
drop_index.groupby('week').sum()

In [None]:
def TrueFalse(value):
    if value[0]=='F':
        value = False
    else:
        value = True
    return value

In [None]:
df2.dtypes

In [None]:
Validation_prob.info()

In [None]:
Validation_prob.groupby('week')['Prob','installed'].sum()

In [None]:
Val_df=Validation_prob[Validation_prob['installed'] == True]

In [None]:
Val_df.shape

In [None]:
np.sum(np.array(y_probVal))/np.sum(y_val)

In [None]:
y_adjusted=prob_adjusted_outcome(probLR2,0.5)

In [None]:
print classification_report(y_test, y_adjusted)

In [None]:
def prob_class_outcome(model_probabilities,select_class):
    prob_class=[]
    for prob in model_probabilities:
            prob_class.append(prob[select_class])
    return prob_class

In [None]:
y_prob=prob_class_outcome(probLR2,1)

In [None]:
np.sum(np.array(y_prob))/np.sum(y_test)

In [None]:
np.sum(y_test)

In [None]:

# Precision Recall Curve
precision, recall, thresholds = precision_recall_curve(y_test,y_prob)

In [None]:
plt.plot(recall,precision)

### Gradient Boosting Classifier

In [None]:
modelGBC = GradientBoostingClassifier(max_features='auto');
modelGBC.fit(X_train, y_train);

In [None]:
param_grid = {"loss": ['deviance', 'exponential'],
              "learning_rate": [0.05, 0.1, 0.2, 0.5],
              "n_estimators": [50, 100, 500, 1000],
              "max_depth": [1, 3, 5, 10]
              "max_features": ['auto','sqrt','log2']}

In [None]:
Psearch=GridSearchCV(GradientBoostingClassifier(), param_grid, scoring="precision")

In [None]:
GridGBCmodel=Psearch.fit(X_train,y_train)

In [None]:
pd.DataFrame(zip(X_train.columns, np.transpose(modelGBC.feature_importances_)))

In [None]:
#Use Training Set to verify if model is learning anything
verify_train = modelGBC.predict(X_train)
prob_train = modelGBC.predict_proba(X_train)
recall_score(y_train,verify_train)

In [None]:
#metrics for recall and confusion matrix

print confusion_matrix(y_train, verify_train, labels=[True, False]).transpose()
print classification_report(y_train, verify_train)

In [None]:
#Prediction using test set
predictedGBC = modelGBC.predict(X_val)
probGBC = modelGBC.predict_proba(X_val)
recall_score(y_val,predictedGBC)

In [None]:
#metrics for recall and confusion matrix

cm = confusion_matrix(y_val, predictedGBC, labels=[True, False]).transpose()
print cm
print classification_report(y_val, predictedGBC)

In [None]:
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label');
plt.xlabel('Predicted label');

In [None]:
np.mean(y_test)

#### Rank Users based on predicted probabilities

In [None]:
ranked_df=pd.DataFrame(predictedGBC)
ranked_df['Probabilities'] = prob_class_outcome(probGBC,1)
ranked_df.columns=['Predicted Outcomes','Probabilities']
ranked_df.sort(columns='Probabilities')

In [None]:
rank_percentile = np.percentile(ranked_df['Probabilities'],1)

In [None]:
plt.hist(ranked_df['Probabilities'])

In [None]:
rank_percentile

In [None]:
X_train.shape

### Random Forests

In [23]:
ratio=np.mean(df['installed'])

In [24]:
#Downsampling by target downsample rate
TotalData=100000
Fsamples=int(np.round(TotalData*(1-ratio)))

In [25]:
Tsamples+Fsamples

NameError: name 'Tsamples' is not defined

In [26]:
False_df=filtered_df[filtered_df['installed'] == False]
Truth_df=filtered_df[filtered_df['installed'] == True]

df2_false=resample(False_df, n_samples=Fsamples, random_state=0, replace = False)
df2_truth=resample(Truth_df, n_samples=TotalData-Fsamples, random_state=0, replace = False)
# Merge the downsampled False with total True outcomes
less_imbalance=pd.concat([df2_false,df2_truth])

In [27]:
y_com,X_com = dmatrices('installed ~ hours + C(weekday) + C(app_type) + C(model) + C(country) + C(publisher_name) + C(language) + C(app_name)'
               ,less_imbalance, return_type='dataframe'
               )

In [28]:
y_com=np.ravel(y_com.iloc[:,1:])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_com,y_com, test_size=0.3, random_state=0)

In [30]:
modelRandomForest= RandomForestClassifier(class_weight='auto', n_estimators=100)
modelRandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
importances = modelRandomForest.feature_importances_
std = np.std([tree.feature_importances_ for tree in modelRandomForest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

In [32]:
small_indices=[1487,103, 4 , 52, 50 , 5, 47, 6, 40, 7, 43, 694, 217, 242,1, 1291, 215]

In [33]:
X_train.columns[small_indices]

Index([u'hours', u'C(publisher_name)[T.Ensolight Interactive]',
       u'C(weekday)[T.5]', u'C(country)[T.USA]', u'C(model)[T.iPod Touch 5G]',
       u'C(weekday)[T.6]', u'C(model)[T.iPhone 6]', u'C(weekday)[T.7]',
       u'C(model)[T.iPhone 4S]', u'C(app_type)[T.games]',
       u'C(model)[T.iPhone 5c (GSM)]', u'C(app_name)[T.Fat No More - iOS]',
       u'C(publisher_name)[T.Scopely, Inc.]', u'C(publisher_name)[T.Tapps]',
       u'C(weekday)[T.2]', u'C(app_name)[T.Swamp Attack - iPhone]',
       u'C(publisher_name)[T.SEGA]'],
      dtype='object')

In [34]:
index_plot=['hours','Pub_name\nE', 'Day 5', 'Country - USA', 'model\niPod Touch 5g', 'Day 6', 'model\niPhone6', 'Day 7',
       'model\niPhone 4S', 'Ad_venue\ngames',
       'model\niPhone 5c', 'App_name\nF',
       'Pub_name\nS', 'Pub_name\nT',
       'Day 2', 'App_name\nS',
       'Pub_name\nSE']

In [35]:
np.arange(1,34, 2)

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33])

In [36]:
len(index_plot)

17

In [37]:
plt.figure()
plt.title("Feature importances")
plt.bar(np.arange(1,34, 2), importances[small_indices],
       color="r", align="center")
plt.xticks(np.arange(1,34, 2), index_plot, rotation= 90)
plt.xlim([-1, 34])
plt.show()

In [None]:
X_train.columns[356]

In [None]:
X_train.columns[117]

In [None]:
# Random Forest Predictions
predictedRF = modelRandomForest.predict(X_train)
probs = modelRandomForest.predict_proba(X_train)

In [None]:
# Look at precision/recall and confusion matrix

print confusion_matrix(y_train,predictedRF)
print classification_report(y_train,predictedRF)

In [None]:
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [None]:
grid_search = GridSearchCV(modelRandomForest, param_grid=param_grid)

In [None]:
for params, mean_score, scores in ZZ.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"%(mean_score, scores.std() * 2, params))

In [None]:
ZZ=grid_search.fit(X_train,y_train)

In [None]:
ZZ.best_params_

In [None]:
XX=ZZ.predict(X_test)

In [None]:
print confusion_matrix(y_test,XX)
print classification_report(y_test,XX)