In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training and test sets from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

In [3]:
# Keep the test-set ids aside: the `id` column is dropped from test_df before
# modelling, but it is needed again when the submission frame is built.
# NOTE: this name shadows the Python builtin `id`; it is kept because the cell
# that builds final_df reads it under this name.
id = test_df['id']

In [4]:
# Peek at the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,realtionship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
0,19717,Married-spouse-absent,Auto,Comedy,Male,45,Primetime,United-States,0.027465,High,No,False
1,31593,Married-civ-spouse,Pharma,Comedy,Male,45,Primetime,United-States,0.027465,Low,No,False
2,5681,Divorced,Entertainment,Comedy,Female,45,Primetime,United-States,0.027465,High,Yes,False
3,15491,Separated,Political,Infomercial,Female,40,Primetime,United-States,0.027465,Low,No,False
4,23587,Married-civ-spouse,Pharma,Comedy,Male,48,Primetime,United-States,0.027465,High,No,True


In [5]:
# Count the missing values in every training column.
train_df.isnull().sum()

id                                   0
realtionship_status                  0
industry                             0
genre                                0
targeted_sex                         0
average_runtime(minutes_per_week)    0
airtime                              0
airlocation                          0
ratings                              0
expensive                            0
money_back_guarantee                 0
netgain                              0
dtype: int64

In [6]:
# Pair the alphabetically sorted airlocation labels of the two sets.
# zip() pairs by position, so the pairs stop lining up from the first label
# that is present in only one of the sets.
train_locations = train_df['airlocation'].value_counts().sort_index().index
test_locations = test_df['airlocation'].value_counts().sort_index().index
set(zip(train_locations, test_locations))

{('Cambodia', 'Cambodia'),
 ('Canada', 'Canada'),
 ('China', 'China'),
 ('Columbia', 'Columbia'),
 ('Cuba', 'Cuba'),
 ('Dominican-Republic', 'Dominican-Republic'),
 ('Ecuador', 'Ecuador'),
 ('El-Salvador', 'El-Salvador'),
 ('England', 'England'),
 ('France', 'France'),
 ('Germany', 'Germany'),
 ('Greece', 'Greece'),
 ('Guatemala', 'Guatemala'),
 ('Haiti', 'Haiti'),
 ('Holand-Netherlands', 'Honduras'),
 ('Honduras', 'Hong'),
 ('Hong', 'Hungary'),
 ('Hungary', 'India'),
 ('India', 'International'),
 ('International', 'Iran'),
 ('Iran', 'Ireland'),
 ('Ireland', 'Italy'),
 ('Italy', 'Jamaica'),
 ('Jamaica', 'Japan'),
 ('Japan', 'Laos'),
 ('Laos', 'Mexico'),
 ('Mexico', 'Nicaragua'),
 ('Nicaragua', 'Outlying-US(Guam-USVI-etc)'),
 ('Outlying-US(Guam-USVI-etc)', 'Peru'),
 ('Peru', 'Philippines'),
 ('Philippines', 'Poland'),
 ('Poland', 'Portugal'),
 ('Portugal', 'Puerto-Rico'),
 ('Puerto-Rico', 'Scotland'),
 ('Scotland', 'South'),
 ('South', 'Taiwan'),
 ('Taiwan', 'Thailand'),
 ('Thailand', '

From the above pairs of airlocation values in the training and testing sets, we can observe that 'Holand-Netherlands' is missing from the testing set. When applying OneHotEncoder or pandas' get_dummies, the features in the training and testing sets would then mismatch and create some overhead during prediction. So it is better to drop the rows with airlocation 'Holand-Netherlands' from the training set.

In [7]:
# Show the training rows aired in 'Holand-Netherlands'.
is_holand = train_df['airlocation'] == 'Holand-Netherlands'
train_df[is_holand]

Unnamed: 0,id,realtionship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
8494,32363,Never-married,ClassAction,Comedy,Male,30,Morning,Holand-Netherlands,0.027465,Low,No,False
25764,17205,Never-married,ClassAction,Comedy,Female,24,Morning,Holand-Netherlands,0.027465,Low,No,False


There are only 2 rows with airlocation 'Holand-Netherlands' in the training set. Dropping 2 rows from the training set doesn't affect it that much. So, we are going to drop those two rows (row index 8494 and 25764) from the training set.

In [8]:
# Remove the training rows whose airlocation is 'Holand-Netherlands'.
# Filtering by value instead of by hard-coded row positions keeps this cell
# idempotent (re-running it no longer drops two unrelated rows) and keeps it
# correct if the rows of Train.csv are ever reordered or extended.
# .copy() gives an independent frame so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df['airlocation'] != 'Holand-Netherlands'].copy()

In [9]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns shared by the training and the test set.
to_be_labeled_test = ['realtionship_status','industry','genre','targeted_sex','airtime','airlocation','expensive','money_back_guarantee']
# The training set additionally carries the target column.
to_be_labeled_train = to_be_labeled_test + ['netgain']

# One encoder per column, fitted on the TRAINING data only and then reused on
# the test data, so a given category always receives the same integer code in
# both sets. (Re-fitting on the test set, as was done before, only yields
# matching codes when both sets happen to contain exactly the same categories.)
# LabelEncoder.transform raises ValueError for a test label that was never
# seen during fitting, which makes such a mismatch visible instead of silent.
label_encoders = {}
for column in to_be_labeled_train:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    if column in to_be_labeled_test:
        test_df[column] = le.transform(test_df[column])
    label_encoders[column] = le

In [10]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise; each gets a '<name>_scaled' companion column.
columns = ['average_runtime(minutes_per_week)','ratings']
for col in columns:
    # Fit the scaler on the training data only and apply the SAME mean/std to
    # the test data. Fitting a second time on the test set would standardise
    # the test features with different statistics than the ones the model is
    # trained on.
    scale = StandardScaler()
    train_df[col+'_scaled'] = scale.fit_transform(np.array(train_df[col]).reshape(-1, 1)).ravel()
    test_df[col+'_scaled'] = scale.transform(np.array(test_df[col]).reshape(-1, 1)).ravel()

In [11]:
# The unscaled numeric columns and the row identifier are not model features:
# remove them from both frames.
not_features = list(columns) + ['id']
train_df = train_df.drop(not_features, axis=1)
test_df = test_df.drop(not_features, axis=1)

In [12]:
# Display the training frame in its current state.
train_df

Unnamed: 0,realtionship_status,industry,genre,targeted_sex,airtime,airlocation,expensive,money_back_guarantee,netgain,average_runtime(minutes_per_week)_scaled,ratings_scaled
0,3,0,0,1,2,38,0,0,0,0.377019,-0.148336
1,2,4,0,1,2,38,1,0,0,0.377019,-0.148336
2,0,2,0,0,2,38,0,1,0,0.377019,-0.148336
3,5,5,3,0,2,38,1,0,0,-0.023650,-0.148336
4,2,4,0,1,2,38,0,0,1,0.617421,-0.148336
...,...,...,...,...,...,...,...,...,...,...,...
26043,2,4,0,1,2,38,1,0,1,0.777688,-0.148336
26044,4,0,0,0,1,38,1,0,0,-0.023650,-0.148336
26045,2,3,0,0,2,38,1,0,1,-1.225657,0.837376
26046,4,0,0,0,1,38,1,1,0,0.617421,-0.148336


In [13]:
# One-hot encode the categorical features; drop_first=True removes the first
# level of every feature to avoid perfectly collinear dummy columns.
categorical_columns = ['realtionship_status','industry','genre','targeted_sex','airtime','airlocation','expensive','money_back_guarantee']
train_df = pd.get_dummies(train_df,columns=categorical_columns,drop_first=True)
test_df = pd.get_dummies(test_df,columns=categorical_columns,drop_first=True)

# get_dummies only creates columns for the categories present in the frame it
# is given, so nothing guarantees that train and test end up with the same
# columns. Align the test frame to the training feature columns (everything
# except the target): a dummy missing from the test set is added as all zeros,
# a dummy unknown to the training set is discarded, and the column order is
# made identical.
feature_columns = [name for name in train_df.columns if name != 'netgain']
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

In [14]:
# Display the training frame in its current state.
train_df

Unnamed: 0,netgain,average_runtime(minutes_per_week)_scaled,ratings_scaled,realtionship_status_1,realtionship_status_2,realtionship_status_3,realtionship_status_4,realtionship_status_5,realtionship_status_6,industry_1,...,airlocation_34,airlocation_35,airlocation_36,airlocation_37,airlocation_38,airlocation_39,airlocation_40,expensive_1,expensive_2,money_back_guarantee_1
0,0,0.377019,-0.148336,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0.377019,-0.148336,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0.377019,-0.148336,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,-0.023650,-0.148336,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,0.617421,-0.148336,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,1,0.777688,-0.148336,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
26044,0,-0.023650,-0.148336,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
26045,1,-1.225657,0.837376,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
26046,0,0.617421,-0.148336,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,1


In [15]:
# Separate the target (netgain) from the feature matrix.
y = train_df['netgain']
X = train_df.drop('netgain', axis=1)

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 1000 trees; the fixed random_state makes the fit repeatable.
rf_classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
)
# Train on the full training set; the fitted estimator is the cell's output.
rf_classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
# Predict the (label-encoded) netgain for the test set.
# Selecting by X.columns guarantees that the test features are handed to the
# model in exactly the order it was trained on; a feature missing from test_df
# raises a KeyError here instead of producing misaligned predictions.
y_pred = rf_classifier.predict(test_df[X.columns])

In [18]:
# Submission frame: the saved test-set ids next to their predicted labels.
submission_columns = {
    'id': id,
    'netgain': y_pred,
}
final_df = pd.DataFrame(submission_columns)

In [19]:
# Turn the encoded 0/1 predictions back into False/True.
label_to_bool = {0: False, 1: True}
final_df['netgain'] = final_df['netgain'].map(label_to_bool)

In [20]:
# Display the submission frame (id and predicted netgain).
final_df

Unnamed: 0,id,netgain
0,1,False
1,4,False
2,5,False
3,9,False
4,10,False
...,...,...
6508,32538,False
6509,32542,False
6510,32549,False
6511,32558,False


In [21]:
# Write the submission file without the DataFrame index column.
final_df.to_csv(path_or_buf='Results.csv', index=False)

In [22]:
# Read the written file back and show its first ten rows as a sanity check.
results_check = pd.read_csv('Results.csv')
results_check.head(10)

Unnamed: 0,id,netgain
0,1,False
1,4,False
2,5,False
3,9,False
4,10,False
5,20,False
6,28,False
7,31,False
8,32,False
9,34,False
