In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import pandas as pd
from sklearn import preprocessing

In [None]:
# Root directory that the data-file paths in this notebook are joined onto.
DATADIR = 'Datasets'
# Path of the UK input file, relative to DATADIR.
UK = 'UK/input.txt'

In [None]:
# Resolve both input files under DATADIR: the CSV read into `descr` in the next
# cell and the raw UK data file itself.
descrpath = os.path.join(DATADIR, "UK/explanation.csv")
datapath = os.path.join(DATADIR, UK)
print(datapath)

In [None]:
# The description file's 'Name' column supplies the column headers used below.
descr = pd.read_csv(descrpath)
print(descr)

# The raw file is read as space-separated with no header row; whitespace
# following each delimiter is skipped, and column names come from `descr`.
data = pd.read_csv(
    datapath,
    sep=" ",
    header=None,
    names=descr["Name"],
    skipinitialspace=True,
)

In [None]:
# Preview the first 30 rows of the loaded frame.
data.head(n=30)

In [None]:
%matplotlib inline
data.hist(bins = 50, figsize =(20,15))

In [None]:
# Pairwise correlations between columns, then each column's correlation with
# "Censor", listed from highest to lowest.
corr_matrix = data.corr()
censor_corr = corr_matrix["Censor"]
censor_corr.sort_values(ascending=False)

In [None]:
# True if any cell in any column is missing (author's note: no missing values).
data.isnull().any().any()

In [None]:
#separate labels and identify column type
# "Censor" is split off as the label; every remaining column is a feature.
cs_labels = data["Censor"].copy()
cs_data = data.drop(columns=["Censor"])

# Columns handled as categorical; everything else is handled as numerical.
categorical_names = [
    "Custgend",
    "Freqpaid",
    "Homephon",
    "Loantype",
    "Marstat",
    "Homeowns",
    "Purpose",
    "Censore",
]
cs_cat = cs_data[categorical_names]
cs_numerical = cs_data.drop(columns=categorical_names)

#numerical pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric columns: median imputation followed by StandardScaler.
# The step names are runtime identifiers (named_steps / set_params), so they
# are kept exactly as written even though 'mm_scaler' wraps a StandardScaler.
median_imputer = SimpleImputer(strategy="median")
standard_scaler = StandardScaler()
num_pipeline = Pipeline(steps=[
    ("imputer", median_imputer),
    ("mm_scaler", standard_scaler),
])

#full pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names routed to each branch of the transformer.
cat_attribs = list(cs_cat.columns)
num_attribs = list(cs_numerical.columns)

# Numeric columns go through num_pipeline; categorical columns are one-hot encoded.
one_hot = OneHotEncoder(categories="auto")
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", one_hot, cat_attribs),
])

#transform all data
# NOTE(review): the imputer, scaler and encoder are fitted on the full data set
# here, before the train/test split further down, so rows that later become
# test rows contribute to the fitted statistics -- confirm this is intended.
cs_prepared = full_pipeline.fit_transform(cs_data)
# A bare expression in the middle of a cell is never displayed; print the
# shape so the size of the transformed data is actually visible.
print(cs_prepared.shape)

#split data into train and test sets
from sklearn.model_selection import train_test_split

# 80/20 split. The fixed seed makes the split, and every score computed from
# it in later cells, reproducible across kernel restarts.
train_set, test_set, train_set_labels, test_set_labels = train_test_split(
    cs_prepared, cs_labels, test_size=0.2, random_state=42
)

# print() as a function runs on both Python 2 and Python 3 and matches the
# other cells of this notebook.
print(train_set[0])
# The label Series keeps its original (now shuffled) index after the split, so
# `train_set_labels[0]` would look up index label 0 and raise KeyError whenever
# that row landed in the test split. `.iloc[0]` takes the first training label.
print(train_set_labels.iloc[0])

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegressionCV

# Cast with the builtin int: the `np.int` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
y = train_set_labels.astype(int)
X = train_set

# Logistic regression with built-in 5-fold cross-validation.
log_reg = LogisticRegressionCV(solver='lbfgs', max_iter=1000, cv=5)
log_reg.fit(X, y)

# Only the last expression of a cell is displayed, so the training-set
# predictions are bound to names instead of being computed and discarded.
train_pred = log_reg.predict(X)
train_proba = log_reg.predict_proba(X)

# Score on the training split.
log_reg.score(X, y)

In [None]:
# Builtin int instead of `np.int`, which was deprecated in NumPy 1.20 and
# removed in 1.24.
yt = test_set_labels.astype(int)
Xt = test_set

# Score of the fitted logistic regression on the held-out test split.
log_reg.score(Xt, yt)

In [None]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score

# The target is an integer class label (the same `y` the logistic regression
# is trained on), so a classifier is used. With GradientBoostingRegressor,
# `.score` returns R^2, which cannot be compared with the accuracy reported
# for the logistic regression.
# NOTE(review): assumes "Censor" is a class label rather than a continuous
# quantity -- confirm against the data description.
gbrt = GradientBoostingClassifier(max_depth=3, n_estimators=10, learning_rate=1)
gbrt.fit(X, y)

# Training-set accuracy, stated explicitly.
accuracy_score(y, gbrt.predict(X))

In [None]:
# Evaluate the model that was fitted on the training split against the
# held-out test split. It is deliberately NOT refitted on (Xt, yt): doing so
# discards the training fit and reports a score on data the model has already
# seen, which says nothing about generalisation.
gbrt.score(Xt, yt)

In [None]:
# 10-fold cross-validation of the logistic-regression estimator on the full
# transformed data set; prints one score per fold.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=log_reg, X=cs_prepared, y=cs_labels, cv=10)
print(scores)