In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
dataset = pd.read_csv("train.csv")
print(dataset.head())

         UID                ph_no    cvv  credit_card_number  \
0  T77962103           7059868985  673.0        2.131868e+14   
1  O92591533  +1-288-810-2425x013  983.0        4.820055e+15   
2  O31883571    791.502.4387x7276  855.0        4.031429e+15   
3  M18080565           2522308761  806.0        3.036423e+13   
4  Q74073854     683-521-2001x423  424.0        4.610207e+15   

                             job                      email  \
0                           Copy  guerrerodavid@example.org   
1  Interior and spatial designer     michaela47@example.org   
2       Scientist, water quality       zhoffman@example.org   
3                   Toxicologist         hprice@example.net   
4              Software engineer      michael61@example.net   

                         url                country  emoji             name  \
0         http://garcia.org/                  Gabon      📑     Cathy Cherry   
1          http://ortiz.com/                Bermuda     🥷🏿    Austin Graham   

In [3]:
dataset = dataset.drop(["UID","ph_no","job","country","cvv","credit_card_number","email","url","emoji","name"],axis =1)

In [4]:
def clean_columns(dataset):
    for column in dataset.columns[:-1]:
        dataset[column] = dataset[column].str.extract(r'(\-?\d+\.\d+)').astype(float)

In [5]:
clean_columns(dataset)

In [6]:
def correlation(dataset, threshold):
  col_cor = set()
  corr_matrix = dataset.corr()
  for i in range(len(corr_matrix.columns)):
    for j in range(i):
      if abs(corr_matrix.iloc[i,j]) > threshold:
        colname = corr_matrix.columns[i]
        col_cor.add(colname)
  return col_cor

In [7]:
corr_features = correlation(dataset.drop(['state'], axis = 1), 0.7)

In [8]:
dataset = dataset.drop(corr_features,axis = 1)

In [9]:
for column in dataset.columns[:-1]:
    column_mean = dataset[column].mean()
    dataset[column].fillna(column_mean, inplace=True)

In [10]:
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [11]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,0:] = sc.fit_transform(x_train[:,0:])
x_test[:,0:] = sc.transform(x_test[:,0:])

In [13]:
from catboost import CatBoostClassifier


classifier = CatBoostClassifier()

classifier.fit(x_train, y_train)

Learning rate set to 0.100655
0:	learn: 2.2784973	total: 906ms	remaining: 15m 5s
1:	learn: 2.2555970	total: 1.64s	remaining: 13m 38s
2:	learn: 2.2348210	total: 2.35s	remaining: 12m 59s
3:	learn: 2.2148339	total: 3.13s	remaining: 13m
4:	learn: 2.1948313	total: 3.95s	remaining: 13m 5s
5:	learn: 2.1771687	total: 4.68s	remaining: 12m 54s
6:	learn: 2.1601099	total: 5.43s	remaining: 12m 50s
7:	learn: 2.1428793	total: 6.15s	remaining: 12m 42s
8:	learn: 2.1275824	total: 6.89s	remaining: 12m 39s
9:	learn: 2.1097129	total: 7.62s	remaining: 12m 34s
10:	learn: 2.0928880	total: 8.34s	remaining: 12m 30s
11:	learn: 2.0779458	total: 9.08s	remaining: 12m 27s
12:	learn: 2.0630080	total: 9.88s	remaining: 12m 29s
13:	learn: 2.0484208	total: 10.6s	remaining: 12m 29s
14:	learn: 2.0329028	total: 11.4s	remaining: 12m 28s
15:	learn: 2.0186833	total: 12.2s	remaining: 12m 27s
16:	learn: 2.0061055	total: 12.9s	remaining: 12m 25s
17:	learn: 1.9926686	total: 13.7s	remaining: 12m 28s
18:	learn: 1.9796577	total: 14.6

<catboost.core.CatBoostClassifier at 0x265c35786d0>

In [14]:
y_pred = classifier.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score,confusion_matrix

acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)

In [31]:
print(acc)
print(cm)

0.8885357142857143
[[2527   32   28   39   32   23   35   27   33   32]
 [  53 2425   29   38   25   25   31   38   35   49]
 [  35   35 2443   49   47   47   20   35   27   46]
 [  36   28   39 2430   34   40   48   37   40   36]
 [  29   25   38   30 2597   39   38   34   25   26]
 [  31   32   33   30   44 2486   28   43   26   38]
 [  41   38   45   36   18   42 2532   29   34   26]
 [  41   38   36   38   28   27   45 2420   21   33]
 [  24   53   29   38   36   36   33   39 2473   21]
 [  53   27   34   34   31   65   27   27   26 2546]]


In [16]:
import joblib

model = classifier


joblib.dump(model, 'MIC_Catboost_updated.pkl')


['MIC_Catboost_updated.pkl']

In [17]:
test_dataset = pd.read_csv("test.csv")
print(test_dataset.head())

         UID                  ph_no     cvv  credit_card_number  \
0  675919160  001-869-364-3240x1461   632.0        4.787566e+12   
1  V09461652    +1-573-696-9623x435   548.0        4.804074e+15   
2  S75396644   001-755-901-1494x000   808.0        4.890170e+15   
3  598599835      (625)805-7487x931  9468.0        5.428366e+15   
4  W60397022     556-206-9662x97397   647.0        2.131239e+14   

                                     job                         email  \
0            Scientist, research (maths)  lawrencereginald@example.org   
1       Higher education careers adviser        jennifer41@example.org   
2                            Illustrator      valdezsheryl@example.net   
3                 Audiological scientist                           NaN   
4  Development worker, international aid    jefferyrussell@example.org   

                    url                           country  emoji  \
0    http://walker.org/                          Honduras  🧜🏻‍♀️   
1      http://sh

In [18]:
test_dataset = test_dataset.drop(["UID","ph_no","job","country","cvv","credit_card_number","email","url","emoji","name"],axis =1)

In [19]:
def clean_columns_new(dataset):
    for column in dataset.columns[0:]:
        dataset[column] = dataset[column].str.extract(r'(\-?\d+\.\d+)').astype(float)

In [20]:
clean_columns_new(test_dataset)

In [21]:
test_dataset = test_dataset.drop(corr_features,axis = 1)

In [22]:
for column in test_dataset.columns[0:]:
    column_mean = test_dataset[column].mean()
    test_dataset[column].fillna(column_mean, inplace=True)

In [23]:
x_test_new = test_dataset.iloc[:,0:].values


In [24]:
x_test_new[:,0:] = sc.transform(x_test_new[:,0:])

In [25]:
y_test_pred = classifier.predict(x_test_new)


In [26]:
submission_dataset = pd.read_csv("test.csv")

In [27]:
final_predictions = np.array(y_test_pred).flatten()

In [28]:
result_df = pd.DataFrame({'UID': submission_dataset['UID'], 'state': final_predictions})


result_df.to_csv('MIC_submission_updated.csv', index=False)

In [29]:
importance_scores = classifier.feature_importances_
print(importance_scores)

[1.1665812  0.5655304  0.90282824 0.70099141 1.62950736 0.79926714
 1.06879334 1.18974074 1.45197795 1.02616788 1.81906348 1.59042649
 1.83669119 0.13459653 1.39742984 1.42153155 1.52227626 1.10198682
 0.7909287  1.20826112 0.9282047  1.72035588 1.37200079 1.13973814
 0.08588114 0.72169018 0.69503462 0.92640784 0.49610392 1.80411155
 0.7499249  0.95598928 0.69502177 1.26967225 1.12684655 0.82138587
 0.6844699  1.63793397 1.3849382  1.60535658 1.30481839 0.6582833
 0.92210539 1.79975766 1.65583017 0.85553074 0.13061895 1.10649543
 1.51624053 1.16567879 0.59626804 1.57372051 0.86815202 1.49273674
 1.43960841 1.42398449 0.03241417 0.55723034 1.23283897 1.40977836
 1.49262926 1.33683143 1.69941481 0.99548893 0.67322953 1.8297202
 1.43809055 1.44599512 1.59566632 1.79085299 1.22238755 0.7901549
 1.41701632 1.38374015 0.92608065 0.77956385 0.05814271 1.41604778
 0.16974243 0.68956167 0.06789134 0.92741887 0.00922524 0.64104917
 0.92030208 0.51551794 1.23015365 1.47951526 1.60970204 1.4618166

In [35]:
column_names = test_dataset.columns.tolist()


92


In [43]:
l1 = [[a,b] for a,b in zip(column_names,importance_scores)]
print(l1)
for i in l1:
    if i[1] <0.05:
        print(i)

[['K3Ll9', 1.166581196371649], ['19rjS', 0.5655303981215882], ['yeIIP', 0.9028282447193023], ['Bw1V5', 0.7009914140470077], ['5k16L', 1.6295073604915633], ['e2l5S', 0.7992671366627592], ['cg31y', 1.068793340569279], ['8SVMv', 1.1897407350742115], ['Xsi3p', 1.4519779514169975], ['l8Y6n', 1.026167882066522], ['vRJwh', 1.8190634822101241], ['CTzXJ', 1.5904264873606713], ['13SMK', 1.8366911854743058], ['WUuos', 0.13459653152225917], ['UyaHP', 1.3974298412050876], ['i6ebQ', 1.4215315545683387], ['ciVeL', 1.5222762615752858], ['fOHiM', 1.1019868245039681], ['1AFO4', 0.7909287018606699], ['b0QUS', 1.2082611230982514], ['vgNLa', 0.9282046964010793], ['EjWMB', 1.7203558795374556], ['Z9KvZ', 1.3720007934424678], ['LsnIb', 1.1397381356695173], ['kMpCl', 0.0858811356452463], ['gomep', 0.721690183270557], ['1Jfzv', 0.6950346228177423], ['piHSs', 0.9264078420885593], ['SQLeJ', 0.49610392342742204], ['184Wa', 1.8041115539507186], ['HTOAr', 0.7499249047489653], ['gItG5', 0.9559892770370124], ['K7fG4',