In [18]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import preprocessing

In [19]:
df = pd.read_csv("ch5_input/train.csv", encoding = "utf-8")
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [20]:
import seaborn as sns
# create counterplot on target var

ModuleNotFoundError: No module named 'seaborn'

In [None]:
df.target.value_counts()

0    487677
1    112323
Name: target, dtype: int64

In [None]:
## The data is skewed. There are less positive targets. So, using the metric Area under the curve AOC.

In [21]:
df.ord_2.unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [22]:
## There are 6 unique variables - ordinal data
## ML models do not understand categorical variables.
## Simplest way - Create a dict with text variables to number - Label Encoding

mapping = {
    "Freezing": 0,
    "Warm":1,
    "Cold":2,
    "Boiling Hot":3,
    "Hot":4,
    "Lava Hot":5
}

In [23]:
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [24]:
df.loc[:,"ord_2"] = df.ord_2.map(mapping)
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [25]:
## Using sklearn to perform Label Encoding
df.ord_1.value_counts()

Novice         160597
Expert         139677
Contributor    109821
Grandmaster     95866
Master          75998
Name: ord_1, dtype: int64

In [26]:
#Create label enc object
# Perform Fillna as encoding cannot handle null values

label_enc = preprocessing.LabelEncoder()
df.loc[:, "ord_1"] = df.ord_1.fillna("NA")
df.loc[:, "ord_1"] = label_enc.fit_transform(df.ord_1.values)


In [27]:
df.ord_1.value_counts()

5    160597
1    139677
0    109821
2     95866
3     75998
4     18041
Name: ord_1, dtype: int64

In [28]:
## Sparse matrix
example = np.array(
    [
        [0,0,1],
        [1,0,1],
        [1,1,0],
    ]
)

example.nbytes

36

In [29]:
## Convert any numpy array to sparse matrix as
from scipy import sparse

sparse_example = sparse.csr_matrix(example)
sparse_example.data.nbytes

20

In [30]:
## Sparse matrix takes less memory than numpy array as it saves only non zero values and their corresponding indexes

In [31]:
## Three values are important in a sparse matrix
print(sparse_example.data)
print(sparse_example.indptr)
print(sparse_example.indices)

[1 1 1 1 1]
[0 1 3 5]
[2 0 2 0 1]


In [32]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,0,4.0,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,2,1.0,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,4,0.0,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,5,5.0,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,2,2.0,h,C,OZ,5.0,12.0,0


In [33]:
## Filtering ids where ord_2 is Boiling Hot
df[df.ord_2 == "Boiling Hot"].shape

(0, 25)

In [34]:
## We can use groupby in pandas
df.groupby(["ord_2"])["id"].count()

ord_2
0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: id, dtype: int64

In [35]:
## Transforming ord_2 to ratio from overall values (numeric values)
df.groupby(["ord_2"])["id"].transform("count")

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [36]:
## Taking two groups into consideration
df.groupby(["ord_1", "ord_2"])["id"].count().reset_index(name="Count").head(10)

Unnamed: 0,ord_1,ord_2,Count
0,0,0.0,26082
1,0,1.0,22774
2,0,2.0,17734
3,0,3.0,15634
4,0,4.0,12428
5,0,5.0,11919
6,1,0.0,33249
7,1,1.0,28900
8,1,2.0,22956
9,1,3.0,19477


In [37]:
## We can also combine two or more features instead os grouping it. We need domain knowlege to select/group features
## Here is a sample - We concatenate using _

df["new_feature"] = ( df.ord_1.astype(str) + "_" + df.ord_2.astype(str) + "_" + df.ord_3.astype(str) )


## Filling NaN values are so important before encoding
- We can simply drop Nan Values. Which is not ideal in handlying NaN values
- Consider NaN values as separate category

In [38]:
# Here considering NaN values as separate category which abhi said "the most preferred way of handling NaN values"
# We have null values in ord_2 column

df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [59]:
# Considering NaN values as separate category
# We are filling it with "None" or "Rare" or "Unknown" string
# This Rare category includes many different categories

# df.ord_2.fillna("None").value_counts()
df.ord_2.fillna("Rare", inplace=True)
df.ord_2.value_counts()

0.0     142726
1.0     124239
2.0      97822
3.0      84790
4.0      67508
5.0      64840
Rare     18075
Name: ord_2, dtype: int64

In [40]:
# Here None has 18075 value counts
# Also the total number of categories in the ordinal column "ord_2" is now 7 not 6 any more.
# We can use KNN clustering to predict the unknown values but when we do so the predicted value belongs something from the training data.
# But what if there exist a new category in the test data. (This way can solve the problem depending upon the dataset - JUST TRY Experimenting;)


In [41]:
"""
In these cases, we can combine test data and train data to check all the existing categories in the feature. Then use it to encoding eventhough the data
doesn't avaibale in the training data, our model will be robust.

This may cause overfitting but to overcome it priorly we should perform cross validation accordingly - Refer book pg:103

Have unseen categories in the validation set similar to the test data. So that when the model train with the traininig data and validated with the 
validation data it will learn to handle unseen categories - Important
"""

"\nIn these cases, we can combine test data and train data to check all the existing categories in the feature. Then use it to encoding eventhough the data\ndoesn't avaibale in the training data, our model will be robust.\n\nThis may cause overfitting but to overcome it priorly we should perform cross validation accordingly - Refer book pg:103\n\nHave unseen categories in the validation set similar to the test data. So that when the model train with the traininig data and validated with the \nvalidation data it will learn to handle unseen categories - Important\n"

In [42]:
# Simple example of performing this operation

#--- Refer code pg 104 ---#

### Now analysing the ord_4 column

In [66]:
df.ord_4.fillna("NAN", inplace=True)
df.ord_4.value_counts()

## Here there are 17930 NaN values as per the book and data

N      39978
P      37890
Y      36657
A      36633
R      33045
U      32897
M      32504
X      32347
C      32112
H      31189
Q      30145
T      29723
O      25610
B      25212
E      21871
K      21676
I      19805
NAN    17930
D      17284
F      16721
W       8268
Z       5790
S       4595
G       3404
V       3107
J       1950
L       1657
Name: ord_4, dtype: int64

In [67]:
## We are going to create a criteria/threshold for the rare category
## Here lets take the all the category count less than 2000 are mentioned as rare category

df.ord_4.fillna("NAN", inplace=True)

## We can access a specific category count as follows, So
df.ord_4.value_counts()["P"]

37890

In [78]:
way1 = (df.ord_4.value_counts() < 2000).values

In [79]:
way2 = df.ord_4.value_counts()[df["ord_4"]].values < 2000

In [87]:
df.loc[way2, "ord_4"] = "Rare"
df.ord_4.unique()  

array(['U', 'X', 'P', 'C', 'Q', 'R', 'Y', 'N', 'I', 'O', 'M', 'E', 'V',
       'K', 'G', 'B', 'H', 'NAN', 'T', 'W', 'A', 'F', 'D', 'S', 'Rare',
       'Z'], dtype=object)

In [95]:
len(way2)

600000

In [96]:
df.ord_4.value_counts()
#Here Rare is 3607 which is 1950 + 1657 (corresponding values of L and J)

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NAN     17930
D       17284
F       16721
W        8268
Z        5790
S        4595
Rare     3607
G        3404
V        3107
Name: ord_4, dtype: int64

## Now we have done two important preprocessing on both training and testing data
### Converting all the categories(less than 2000) - to "Rare" Category
### Considering all the missing values - as "None" category

#### As we do this on both training and testing data, this models works well in a live setting even if we have new categories

#### We can build model with the data

In [97]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,new_feature
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,3.0,0,4.0,c,U,Pw,6.0,3.0,0,0_4.0_c
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,3.0,2,1.0,e,X,pE,7.0,7.0,0,2_1.0_e
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,3.0,4,0.0,n,P,eN,5.0,9.0,0,4_0.0_n
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,1.0,5,5.0,a,C,,3.0,3.0,0,5_5.0_a
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,3.0,2,2.0,h,C,OZ,5.0,12.0,0,2_2.0_h


In [98]:
## Cross validation is important before start building our model
## We already know that the data is skewed so we are going with StratifiedKFold Cross validation
## Creating and managing a separate project for the chapter5