In [29]:
import numpy as np
import pandas as pd
import seaborn as sns


In [30]:
dataset = pd.read_csv("train.csv")
print(dataset.head())

         UID                ph_no    cvv  credit_card_number  \
0  T77962103           7059868985  673.0        2.131868e+14   
1  O92591533  +1-288-810-2425x013  983.0        4.820055e+15   
2  O31883571    791.502.4387x7276  855.0        4.031429e+15   
3  M18080565           2522308761  806.0        3.036423e+13   
4  Q74073854     683-521-2001x423  424.0        4.610207e+15   

                             job                      email  \
0                           Copy  guerrerodavid@example.org   
1  Interior and spatial designer     michaela47@example.org   
2       Scientist, water quality       zhoffman@example.org   
3                   Toxicologist         hprice@example.net   
4              Software engineer      michael61@example.net   

                         url                country  emoji             name  \
0         http://garcia.org/                  Gabon      📑     Cathy Cherry   
1          http://ortiz.com/                Bermuda     🥷🏿    Austin Graham   

In [31]:
dataset = dataset.drop(["UID","ph_no","job","UZRdX","Bz7Ov","country","cvv","credit_card_number","email","url","emoji","name"],axis =1)

In [32]:
def clean_columns(dataset):
    for column in dataset.columns[:-1]:
        dataset[column] = dataset[column].str.extract(r'(\-?\d+\.\d+)').astype(float)

In [33]:
clean_columns(dataset)

In [34]:
def correlation(dataset, threshold):
  col_cor = set()
  corr_matrix = dataset.corr()
  for i in range(len(corr_matrix.columns)):
    for j in range(i):
      if abs(corr_matrix.iloc[i,j]) > threshold:
        colname = corr_matrix.columns[i]
        col_cor.add(colname)
  return col_cor

In [35]:
corr_features = correlation(dataset.drop(['state'], axis = 1), 0.7)

In [36]:
dataset = dataset.drop(corr_features,axis = 1)

In [37]:
for column in dataset.columns[:-1]:
    column_mean = dataset[column].mean()
    dataset[column].fillna(column_mean, inplace=True)

In [38]:
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [39]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=42)

In [40]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,0:] = sc.fit_transform(x_train[:,0:])
x_test[:,0:] = sc.transform(x_test[:,0:])

In [41]:
from catboost import CatBoostClassifier


classifier = CatBoostClassifier()

classifier.fit(x_train, y_train)

Learning rate set to 0.101261
0:	learn: 2.2788355	total: 942ms	remaining: 15m 41s
1:	learn: 2.2553259	total: 1.93s	remaining: 16m 2s
2:	learn: 2.2351885	total: 2.89s	remaining: 16m 1s
3:	learn: 2.2134733	total: 4.05s	remaining: 16m 49s
4:	learn: 2.1926306	total: 5.19s	remaining: 17m 12s
5:	learn: 2.1739652	total: 6.36s	remaining: 17m 34s
6:	learn: 2.1551582	total: 7.43s	remaining: 17m 34s
7:	learn: 2.1367108	total: 8.3s	remaining: 17m 9s
8:	learn: 2.1206965	total: 9.16s	remaining: 16m 48s
9:	learn: 2.1055113	total: 10.1s	remaining: 16m 39s
10:	learn: 2.0902578	total: 11s	remaining: 16m 33s
11:	learn: 2.0742228	total: 11.9s	remaining: 16m 18s
12:	learn: 2.0608411	total: 12.7s	remaining: 16m 7s
13:	learn: 2.0465803	total: 13.7s	remaining: 16m 5s
14:	learn: 2.0311924	total: 14.6s	remaining: 15m 58s
15:	learn: 2.0172480	total: 15.4s	remaining: 15m 49s
16:	learn: 2.0038039	total: 16.3s	remaining: 15m 43s
17:	learn: 1.9902427	total: 17.2s	remaining: 15m 40s
18:	learn: 1.9768438	total: 18.1s	

<catboost.core.CatBoostClassifier at 0x1c129ad2fd0>

In [42]:
y_pred = classifier.predict(x_test)

In [43]:
from sklearn.metrics import accuracy_score,confusion_matrix

acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)

print(acc)
print(cm)

0.8879285714285714
[[1232   10   13   20   20   14   21   16   19   13]
 [  23 1242   12   15   14   15   17   18   20   27]
 [  16   18 1175   17   24   29    6   20   16   18]
 [  17   14   18 1205   18   18   23   20   23   21]
 [  15   14   10   14 1328   18   15   11   16   11]
 [  14   14   14   17   28 1234   13   18   11   25]
 [  21   21   22   20   10   23 1301   13   16   14]
 [  24   14   19   16   10   18   24 1215   12   13]
 [  17   25   25   29   22   16   19   20 1227   12]
 [  24    9   16   16   11   37   13   11   16 1272]]


In [44]:
import joblib

model = classifier


joblib.dump(model, 'MIC_Catboost_updated.pkl')


['MIC_Catboost_updated.pkl']

In [45]:
test_dataset = pd.read_csv("test.csv")
print(test_dataset.head())

         UID                  ph_no     cvv  credit_card_number  \
0  675919160  001-869-364-3240x1461   632.0        4.787566e+12   
1  V09461652    +1-573-696-9623x435   548.0        4.804074e+15   
2  S75396644   001-755-901-1494x000   808.0        4.890170e+15   
3  598599835      (625)805-7487x931  9468.0        5.428366e+15   
4  W60397022     556-206-9662x97397   647.0        2.131239e+14   

                                     job                         email  \
0            Scientist, research (maths)  lawrencereginald@example.org   
1       Higher education careers adviser        jennifer41@example.org   
2                            Illustrator      valdezsheryl@example.net   
3                 Audiological scientist                           NaN   
4  Development worker, international aid    jefferyrussell@example.org   

                    url                           country  emoji  \
0    http://walker.org/                          Honduras  🧜🏻‍♀️   
1      http://sh

In [46]:
test_dataset = test_dataset.drop(["UID","ph_no","job","UZRdX","Bz7Ov","country","cvv","credit_card_number","email","url","emoji","name"],axis =1)

In [47]:
def clean_columns_new(dataset):
    for column in dataset.columns[0:]:
        dataset[column] = dataset[column].str.extract(r'(\-?\d+\.\d+)').astype(float)

In [48]:
clean_columns_new(test_dataset)

In [49]:
test_dataset = test_dataset.drop(corr_features,axis = 1)

In [50]:
for column in test_dataset.columns[0:]:
    column_mean = test_dataset[column].mean()
    test_dataset[column].fillna(column_mean, inplace=True)

In [51]:
x_test_new = test_dataset.iloc[:,0:].values


In [52]:
x_test_new[:,0:] = sc.transform(x_test_new[:,0:])

In [53]:
y_test_pred = classifier.predict(x_test_new)


In [54]:
submission_dataset = pd.read_csv("test.csv")

In [55]:
final_predictions = np.array(y_test_pred).flatten()

In [56]:
result_df = pd.DataFrame({'UID': submission_dataset['UID'], 'state': final_predictions})


result_df.to_csv('MIC_submission_updated_2.csv', index=False)