In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

#import mglearn
from IPython.display import display
from matplotlib import rc

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

In [3]:
data= pd.read_csv("color_survey_answers.csv",  sep='\t')

In [4]:
data.head()


Unnamed: 0,id,user_id,datestamp,r,g,b,colorname
0,1,1,1267419000.0,72,100,175,pastel blue
1,2,1,1267419000.0,204,177,246,faint violet
2,3,1,1267419000.0,182,226,245,baby blue
3,4,1,1267419000.0,130,64,234,purple
4,5,2,1267419000.0,75,49,234,blue


In [5]:
data.columns

Index(['id', 'user_id', 'datestamp', 'r', 'g', 'b', 'colorname'], dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3363127 entries, 0 to 3363126
Data columns (total 7 columns):
id           int64
user_id      int64
datestamp    float64
r            int64
g            int64
b            int64
colorname    object
dtypes: float64(1), int64(5), object(1)
memory usage: 179.6+ MB


In [7]:
#converting date from float to datetime
from datetime import datetime
data['datestamp'] = pd.to_datetime(data['datestamp'],unit='s')

In [8]:
# to check the null value in the dataset
print(data.isnull().values.sum())

1734


In [9]:
#data["colorname"].value_counts()

In [10]:
data = data.fillna({"colorname": "green"})

In [11]:
#extracting new colours into another dataframe from original

In [12]:
new_colour = data[data['colorname'] == 'impatiens juice'].copy()

In [13]:
new_colour = data[data['colorname'] == 'caribbean surf'].append(new_colour)

In [14]:
new_colour = data[data['colorname'] == 'medium tan leather'].append(new_colour)

In [15]:
new_colour = data[data['colorname'] == 'babypoo'].append(new_colour)

In [16]:
#new_colour.tail()

In [17]:
new_colour.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 185291 to 3170468
Data columns (total 7 columns):
id           8 non-null int64
user_id      8 non-null int64
datestamp    8 non-null datetime64[ns]
r            8 non-null int64
g            8 non-null int64
b            8 non-null int64
colorname    8 non-null object
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 512.0+ bytes


In [18]:
#increasing the occurance of new colours
new_colour = new_colour.append([new_colour]*20)

In [19]:
new_colour.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 185291 to 3170468
Data columns (total 7 columns):
id           168 non-null int64
user_id      168 non-null int64
datestamp    168 non-null datetime64[ns]
r            168 non-null int64
g            168 non-null int64
b            168 non-null int64
colorname    168 non-null object
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 10.5+ KB


In [20]:
#filtering the unwanted colours which have occurance of 20 or less
data = data.groupby('colorname').filter(lambda x: len(x) > 21)

In [21]:
#combining new colours to the filtered original dataset
data = pd.concat([data,new_colour],ignore_index=True)

In [22]:
data['colorname'].value_counts()

green                 311798
blue                  284220
purple                245832
pink                  129286
brown                  74782
red                    68985
light blue             57381
teal                   56336
orange                 51449
light green            49073
yellow                 43494
magenta                43139
grey                   35664
sky blue               34043
violet                 32407
lime green             30918
light purple           29221
turquoise              25925
lavender               25447
cyan                   24421
tan                    23710
dark green             23624
dark blue              23116
aqua                   23036
forest green           19112
mauve                  18899
bright green           18578
gray                   18003
olive                  17339
dark purple            16653
                       ...  
orchid pink               22
baby food green           22
bright purple-blue        22
violetish blue

In [23]:
#shuffling the dataset to have equal distribution of each colour in the dataset
#data= data.sample(frac = 1,random_state= 10).reset_index(drop = True)

In [24]:
##taking a subset of dataset
small_data = data.sample(2500,random_state= 10)

In [25]:
small_data.head()

Unnamed: 0,id,user_id,datestamp,r,g,b,colorname
2865370,3185369,141804,2010-03-31 03:30:54,197,80,165,mauve
662610,732457,33970,2010-03-03 19:01:05,222,78,201,purple
2692584,2993683,133589,2010-03-27 22:35:50,222,174,70,butternut squash
1282872,1431425,65598,2010-03-08 21:09:40,228,187,6,goldenrod
932594,1047073,47641,2010-03-05 15:35:28,123,185,243,sky blue


In [26]:
# on_hot_encoding
pd.get_dummies(small_data, columns=["colorname"], prefix = ['colorname']).head()

Unnamed: 0,id,user_id,datestamp,r,g,b,colorname_ blue,colorname_!,colorname_3,colorname_5,...,colorname_y,colorname_yello,colorname_yellow,colorname_yellow brown,colorname_yellow green,colorname_yellow orange,colorname_yellow-brown,colorname_yellow-green,colorname_yellow/green,colorname_yellowish green
2865370,3185369,141804,2010-03-31 03:30:54,197,80,165,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
662610,732457,33970,2010-03-03 19:01:05,222,78,201,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2692584,2993683,133589,2010-03-27 22:35:50,222,174,70,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1282872,1431425,65598,2010-03-08 21:09:40,228,187,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932594,1047073,47641,2010-03-05 15:35:28,123,185,243,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Split the data into features and target label
y = np.array(small_data['colorname'])
X = np.array(small_data.drop(['colorname', 'datestamp'], axis = 1))

In [28]:
# train and test splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [29]:
#creating pipeline with PCA and classifier
pca_rf = Pipeline([('preprocessing', StandardScaler()),('pca', PCA(n_components= 'mle')), ('classifier', RandomForestClassifier(max_features='auto',n_estimators=1500,random_state=47))])

In [30]:
param_grid = [
{'classifier': [RandomForestClassifier(max_features='auto',n_estimators=1500,random_state=47)],
'classifier__min_samples_leaf': [4,6,8], 'classifier__max_depth': [4,6,8],'classifier__max_leaf_nodes': [4,6,8]},
]

In [31]:
grid = GridSearchCV(pca_rf, param_grid, cv=2,n_jobs=-1, verbose=1)

grid.fit(X_train,y_train)

print("Accuracy: {:.2f}".format(grid.score(X_test, y_test)))


Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.1min finished


Accuracy: 0.34


In [32]:
# custom binary encoding

In [33]:
small_data['color_green'] = np.where(small_data['colorname'].str.contains('green'), 1, 0)

In [34]:
small_data['color_red'] = np.where(small_data['colorname'].str.contains('red'), 1, 0)

In [35]:
small_data['color_pink'] = np.where(small_data['colorname'].str.contains('pink'), 1, 0)

In [36]:
small_data['color_blue'] = np.where(small_data['colorname'].str.contains('blue'), 1, 0)

In [37]:
small_data['color_orange'] = np.where(small_data['colorname'].str.contains('orange'), 1, 0)

In [38]:
small_data['color_yellow'] = np.where(small_data['colorname'].str.contains('yellow'), 1, 0)

In [39]:
small_data['color_purple'] = np.where(small_data['colorname'].str.contains('purple'), 1, 0)

In [40]:
small_data['color_tan'] = np.where(small_data['colorname'].str.contains('tan'), 1, 0)

In [41]:
#small_data

In [42]:
# Split the data into features and target label
y1 = np.array(small_data['colorname'])
X1 = np.array(small_data.drop(['colorname', 'datestamp'], axis = 1))

In [43]:
# train and test splitting the data
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.33, random_state = 0)

In [44]:
grid = GridSearchCV(pca_rf, param_grid, cv=2,n_jobs=-1, verbose=1)

grid.fit(X1_train,y1_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.9min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='g...tors=1500, n_jobs=None,
            oob_score=False, random_state=47, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'classifier': [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=8,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min...ples_leaf': [4, 6, 8], 'classifier__max_depth': [4, 6, 8], 'classifier__max_leaf_nodes': [4, 6, 8]}],
       pre_dispatch='2*n_jobs', refit=T

In [45]:
print("Accuracy: {:.2f}".format(grid.score(X1_test, y1_test)))
print("Best parameters: {}".format(grid.best_params_))



Accuracy: 0.45
Best parameters: {'classifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=8,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=None,
            oob_score=False, random_state=47, verbose=0, warm_start=False), 'classifier__max_depth': 4, 'classifier__max_leaf_nodes': 8, 'classifier__min_samples_leaf': 4}


In [46]:
pred = grid.predict(X1_test)

