# MAIN NOTEBOOK

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.decomposition import PCA
from xgboost import plot_tree
import matplotlib.pyplot as plt
#plt.rcParams["figure.figsize"] = (20,20)

In [9]:
predictors = pd.read_csv(filepath_or_buffer = "predictors.csv")
print(predictors.head())


   id  bonus        date  color date_format
0   1   0.39  07/31/2020  green    %m/%d/%Y
1   2   0.28  11/04/2020    red    %m/%d/%Y
2   3   0.27  2020-05-12    red    %Y-%m-%d
3   4   0.28  09/17/2020    red    %m/%d/%Y
4   5   0.35  2020-12-15  green    %Y-%m-%d


In [10]:
results = pd.read_csv("results.tsv",sep='\t')
print(results.head())


    id  outcome
0  122        1
1  138        0
2  330        1
3   97        1
4  322        1


In [14]:
results = results.sort_values(by = "id")
print(results.head())
print(results.shape)

     id  outcome
153   1        1
54    2        0
15    3        0
40    5        1
20   11        1
(200, 2)


In [13]:
join_df = pd.merge(predictors, results, on = "id", how = "inner")
print(join_df.head())
print(join_df.shape)

   id  bonus        date  color date_format  outcome
0   1   0.39  07/31/2020  green    %m/%d/%Y        1
1   2   0.28  11/04/2020    red    %m/%d/%Y        0
2   3   0.27  2020-05-12    red    %Y-%m-%d        0
3   5   0.35  2020-12-15  green    %Y-%m-%d        1
4  11   0.09  01/17/2020   blue    %m/%d/%Y        1
(200, 6)


In [15]:
print(join_df.dtypes)

id               int64
bonus          float64
date            object
color           object
date_format     object
outcome          int64
dtype: object


In [18]:
from datetime import datetime as dt
datetime_object = dt.strptime('07/31/2020', '%m/%d/%Y')


2020-07-31 00:00:00


In [20]:
mini = join_df["date"][0]
mini_format = join_df["date_format"][0]

def date_convert(dat,form):
    datetime_object = dt.strptime(str(dat), str(form))
    return datetime_object

object = date_convert(mini, mini_format)
print(object)


2020-07-31 00:00:00


In [23]:
new_dates = []
for dat, form in zip(join_df["date"], join_df["date_format"]):
    object = date_convert(dat, form)
    new_dates.append(object)

print(new_dates[-5:-1])
print(join_df["date"][-5:-1])
print(join_df["date_format"][-5:-1])
    


[datetime.datetime(2020, 8, 24, 0, 0), datetime.datetime(2020, 9, 11, 0, 0), datetime.datetime(2020, 9, 19, 0, 0), datetime.datetime(2020, 12, 3, 0, 0)]
195    08/24/2020
196    09/11/2020
197    19.09.2020
198    03.12.2020
Name: date, dtype: object
195    %m/%d/%Y
196    %m/%d/%Y
197    %d.%m.%Y
198    %d.%m.%Y
Name: date_format, dtype: object


In [24]:
join_df["new_date"] = new_dates
print(join_df.head())

   id  bonus        date  color date_format  outcome   new_date
0   1   0.39  07/31/2020  green    %m/%d/%Y        1 2020-07-31
1   2   0.28  11/04/2020    red    %m/%d/%Y        0 2020-11-04
2   3   0.27  2020-05-12    red    %Y-%m-%d        0 2020-05-12
3   5   0.35  2020-12-15  green    %Y-%m-%d        1 2020-12-15
4  11   0.09  01/17/2020   blue    %m/%d/%Y        1 2020-01-17


In [25]:
new_df = join_df.drop(columns = ["date", "date_format"])
print(new_df.head())

   id  bonus  color  outcome   new_date
0   1   0.39  green        1 2020-07-31
1   2   0.28    red        0 2020-11-04
2   3   0.27    red        0 2020-05-12
3   5   0.35  green        1 2020-12-15
4  11   0.09   blue        1 2020-01-17


In [31]:
color_set = new_df["color"].unique()
new_df["color"] = [s.strip() for s in new_df["color"]]
color_set2 = new_df["color"].unique()
print(color_set2)

['green' 'red' 'blue']


In [34]:
new_df["green"] = [1 if s =='green' else 0 for s in new_df["color"]]
new_df["red"] = [1 if s =='red' else 0 for s in new_df["color"]]
new_df["blue"] = [1 if s =='blue' else 0 for s in new_df["color"]] ### exclude intercept in linear model if all 3 used

In [35]:
print(new_df.head())

   id  bonus  color  outcome   new_date  green  red  blue
0   1   0.39  green        1 2020-07-31      1    0     0
1   2   0.28    red        0 2020-11-04      0    1     0
2   3   0.27    red        0 2020-05-12      0    1     0
3   5   0.35  green        1 2020-12-15      1    0     0
4  11   0.09   blue        1 2020-01-17      0    0     1


In [37]:
print(new_df.groupby("color")["outcome"].mean())



color
blue     0.649123
green    0.330000
red      0.534884
Name: outcome, dtype: float64


In [39]:
print(min(new_dates))
print(max(new_dates))
print(set(new_dates))

2020-01-04 00:00:00
2020-12-29 00:00:00
{datetime.datetime(2020, 11, 4, 0, 0), datetime.datetime(2020, 10, 10, 0, 0), datetime.datetime(2020, 10, 19, 0, 0), datetime.datetime(2020, 10, 26, 0, 0), datetime.datetime(2020, 3, 9, 0, 0), datetime.datetime(2020, 11, 28, 0, 0), datetime.datetime(2020, 1, 11, 0, 0), datetime.datetime(2020, 6, 25, 0, 0), datetime.datetime(2020, 7, 25, 0, 0), datetime.datetime(2020, 3, 21, 0, 0), datetime.datetime(2020, 9, 16, 0, 0), datetime.datetime(2020, 12, 27, 0, 0), datetime.datetime(2020, 7, 8, 0, 0), datetime.datetime(2020, 9, 24, 0, 0), datetime.datetime(2020, 3, 8, 0, 0), datetime.datetime(2020, 9, 11, 0, 0), datetime.datetime(2020, 2, 18, 0, 0), datetime.datetime(2020, 7, 2, 0, 0), datetime.datetime(2020, 11, 17, 0, 0), datetime.datetime(2020, 3, 16, 0, 0), datetime.datetime(2020, 6, 28, 0, 0), datetime.datetime(2020, 5, 9, 0, 0), datetime.datetime(2020, 3, 26, 0, 0), datetime.datetime(2020, 11, 21, 0, 0), datetime.datetime(2020, 9, 13, 0, 0), datetim

In [51]:
## Modelling
from sklearn.linear_model import LogisticRegression

train_df = new_df.drop(columns = ["id", "new_date","outcome","color"])
columns = train_df.columns

train_df = train_df.to_numpy()
train_labels = new_df["outcome"].to_numpy()

clf = LogisticRegression(random_state = 0, fit_intercept = False).fit(train_df, train_labels)

predictions = clf.predict(train_df)
probabilities = clf.predict_proba(train_df)

print(predictions)
print(probabilities)

[0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1
 0 1 1 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0
 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0
 0 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 0 1
 0 0 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0
 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1]
[[0.6643114  0.3356886 ]
 [0.47573985 0.52426015]
 [0.4743013  0.5256987 ]
 [0.65914644 0.34085356]
 [0.35379375 0.64620625]
 [0.36307914 0.63692086]
 [0.67198541 0.32801459]
 [0.66173374 0.33826626]
 [0.3617462  0.6382538 ]
 [0.66173374 0.33826626]
 [0.67071264 0.32928736]
 [0.66687931 0.33312069]
 [0.47286319 0.52713681]
 [0.65784921 0.34215079]
 [0.6643114  0.3356886 ]
 [0.36575133 0.63424867]
 [0.66559659 0.33440341]
 [0.37787673 0.62212327]
 [0.47142552 0.52857448]
 [0.66943736 0.33056264]
 [0.36575133 0.63424867]
 [0.65914644 0.34085356]
 [0.66687931 0.33312069]
 [0.67198541 0.328

In [48]:
score = clf.score(train_df, train_labels)

In [52]:
print(clf.coef_)
print(columns)

[[-0.57685782 -0.45759254  0.25863706  0.65432183]]
Index(['bonus', 'green', 'red', 'blue'], dtype='object')
