In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [4]:
# Read the tab-separated training file and print a summary of its columns.
train_df = pd.read_csv("./data/train.tsv", sep="\t")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 15 columns):
id                              89 non-null int64
Y                               89 non-null int64
Alcohol                         89 non-null float64
Malic acid                      89 non-null float64
Ash                             89 non-null float64
Alcalinity of ash               89 non-null float64
Magnesium                       89 non-null int64
Total phenols                   89 non-null float64
Flavanoids                      89 non-null float64
Nonflavanoid phenols            89 non-null float64
Proanthocyanins                 89 non-null float64
Color intensity                 89 non-null float64
Hue                             89 non-null float64
OD280/OD315 of diluted wines    89 non-null float64
Proline                         89 non-null int64
dtypes: float64(11), int64(4)
memory usage: 10.5 KB


In [5]:
# Eyeball the first ten rows of the raw training data.
train_df.head(n=10)

Unnamed: 0,id,Y,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,0,3,12.36,3.83,2.38,21.0,88,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520
1,1,2,12.42,4.43,2.73,26.5,102,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365
2,3,1,13.05,1.77,2.1,17.0,107,3.0,3.0,0.28,2.03,5.04,0.88,3.35,885
3,6,1,13.05,1.65,2.55,18.0,98,2.45,2.43,0.29,1.44,4.25,1.12,2.51,1105
4,9,1,13.05,2.05,3.22,25.0,124,2.63,2.68,0.47,1.92,3.58,1.13,3.2,830
5,11,3,12.93,2.81,2.7,21.0,96,1.54,0.5,0.53,0.75,4.6,0.77,2.31,600
6,12,3,13.36,2.56,2.35,20.0,89,1.4,0.5,0.37,0.64,5.6,0.7,2.47,780
7,13,2,12.29,1.41,1.98,16.0,85,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428
8,14,1,13.07,1.5,2.1,15.5,98,2.4,2.64,0.28,1.37,3.7,1.18,2.69,1020
9,15,1,13.74,1.67,2.25,16.4,118,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060


In [6]:
# Count the missing values in every column.
train_df.isna().sum()

id                              0
Y                               0
Alcohol                         0
Malic acid                      0
Ash                             0
Alcalinity of ash               0
Magnesium                       0
Total phenols                   0
Flavanoids                      0
Nonflavanoid phenols            0
Proanthocyanins                 0
Color intensity                 0
Hue                             0
OD280/OD315 of diluted wines    0
Proline                         0
dtype: int64

In [7]:
# Pairwise correlation between all columns of the training frame
# (Pearson is the pandas default; it is spelled out here for the reader).
train_corr = train_df.corr(method="pearson")
train_corr

Unnamed: 0,id,Y,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
id,1.0,0.023779,0.012532,0.077124,0.08116,-0.027558,-0.136686,0.095052,0.063847,0.048068,0.186814,0.032114,0.032548,0.012566,0.038418
Y,0.023779,1.0,-0.258755,0.48027,-0.012591,0.513783,-0.24464,-0.705636,-0.838767,0.445491,-0.490509,0.231353,-0.594879,-0.748733,-0.642625
Alcohol,0.012532,-0.258755,1.0,0.114406,0.178058,-0.285264,0.234347,0.235593,0.176669,-0.050913,0.152768,0.563076,-0.126136,-0.010794,0.616025
Malic acid,0.077124,0.48027,0.114406,1.0,0.216048,0.314007,-0.133594,-0.362064,-0.417656,0.381803,-0.22818,0.286388,-0.595126,-0.42366,-0.230116
Ash,0.08116,-0.012591,0.178058,0.216048,1.0,0.518484,0.233568,0.247052,0.214522,0.284366,0.151467,0.286958,-0.168341,0.047649,0.16375
Alcalinity of ash,-0.027558,0.513783,-0.285264,0.314007,0.518484,1.0,-0.024986,-0.224817,-0.2737,0.390777,-0.148378,0.075292,-0.266425,-0.194741,-0.418648
Magnesium,-0.136686,-0.24464,0.234347,-0.133594,0.233568,-0.024986,1.0,0.248484,0.269803,-0.260889,0.223654,0.114945,0.133985,0.144553,0.352766
Total phenols,0.095052,-0.705636,0.235593,-0.362064,0.247052,-0.224817,0.248484,1.0,0.810988,-0.340374,0.577898,0.008251,0.415903,0.671466,0.566918
Flavanoids,0.063847,-0.838767,0.176669,-0.417656,0.214522,-0.2737,0.269803,0.810988,1.0,-0.48159,0.641146,-0.101357,0.512982,0.758471,0.544615
Nonflavanoid phenols,0.048068,0.445491,-0.050913,0.381803,0.284366,0.390777,-0.260889,-0.340374,-0.48159,1.0,-0.321861,0.21549,-0.288939,-0.501244,-0.251767


In [8]:
# Number of distinct values in the target column; dropna=False keeps the
# count identical to counting the entries returned by unique().
train_df["Y"].nunique(dropna=False)

3

In [None]:
# Encoder aliases from the project-local helper module. Only NVE is used in
# this cell; the other aliases are kept in case later cells rely on them.
from my_modules.my_encoder import CategoryValueEncoder as CVE
from my_modules.my_encoder import TextValueEncoder as TVE
from my_modules.my_encoder import DateValueEncoder as DVE
from my_modules.my_encoder import NumericValueEncoder as NVE

# Build the model-ready frame from the columns that actually exist in
# train_df. The earlier version of this cell indexed train_df with columns
# such as 'project_is_approved' and 'school_state', which are not part of
# this dataset (see the train_df.info() output) and raised KeyError.
#
# train_df holds an 'id' column, the target 'Y' and thirteen numeric
# measurement columns, so every feature goes through the numeric encoder.
# NOTE(review): 'id' is left out on the assumption that it is only a row
# identifier -- confirm it is not needed downstream.
TARGET_COLUMN = "Y"
ID_COLUMN = "id"

feature_columns = [
    column
    for column in train_df.columns
    if column not in (ID_COLUMN, TARGET_COLUMN)
]

# Target first, then one normalized block per feature, joined column-wise.
# NOTE(review): assumes NVE(...).normalize() returns a pandas object that
# keeps the original column name -- verify against my_modules.my_encoder.
train_df_encode = pd.concat(
    [train_df[TARGET_COLUMN]]
    + [NVE(train_df[column]).normalize() for column in feature_columns],
    axis=1,
)

In [None]:
# Print the encoded frame's column names, dtypes and non-null counts.
train_df_encode.info()

In [None]:
# Preview the first five rows of the encoded frame (head() defaults to 5).
train_df_encode.head()

In [None]:
# Persist the encoded frame to CSV without writing the index column.
train_df_encode.to_csv(path_or_buf="train_df_encode.csv", index=False)