In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [3]:
train_df.head()

Unnamed: 0,customer_id,customer_visit_score,customer_product_search_score,customer_ctr_score,customer_stay_score,customer_frequency_score,customer_product_variation_score,customer_order_score,customer_affinity_score,customer_active_segment,X1,customer_category
0,csid_1,13.168425,9.447662,-0.070203,-0.139541,0.436956,4.705761,2.537985,7.959503,C,F,0
1,csid_2,17.092979,7.329056,0.153298,-0.102726,0.38034,4.205138,4.193444,17.517381,C,A,0
2,csid_3,17.505334,5.143676,0.106709,0.262834,0.417648,4.47907,3.878971,12.595155,C,BA,0
3,csid_4,31.423381,4.91774,-0.020226,-0.100526,0.77813,5.055535,2.70894,4.795073,AA,F,0
4,csid_5,11.909502,4.237073,0.187178,0.172891,0.162067,3.445247,3.67736,56.636326,C,AA,0


In [56]:
def impute_categoriel_nullvalues_with_most_common(df):
    df['customer_active_segment'].fillna(df['customer_active_segment'].mode().iloc[0],inplace=True)
    df['X1'].fillna(df['X1'].mode().iloc[0],inplace=True)
    return df

In [57]:
train_df = impute_categoriel_nullvalues_with_most_common(train_df)
test_df = impute_categoriel_nullvalues_with_most_common(test_df)

In [58]:
def oneHot(df):
    encode_df = pd.DataFrame(df, columns=["customer_active_segment","X1"])
    dumm_df = pd.get_dummies(encode_df, columns=["customer_active_segment","X1"], prefix=["CustomerSegment","Loyality"] )
    df = df.join(dumm_df)
    dumm_df = pd.get_dummies(encode_df, columns=["customer_active_segment","X1"], prefix=["CustomerSegment","Loyality"] )
    df.drop(columns=['customer_active_segment','X1'],inplace=True)
    return df

In [59]:
train_df = oneHot(train_df)
test_df = oneHot(test_df)

In [60]:
train_df = train_df.iloc[:,1:]
cust_id = test_df['customer_id']
test_df = test_df.iloc[:,1:]

In [61]:
def imputer(data):
    imp_mean = IterativeImputer(estimator=RandomForestRegressor(), random_state=0,verbose=2)
    imp_mean.fit(data)
    return pd.DataFrame(imp_mean.transform(data))

In [62]:
knnimputer = KNNImputer(n_neighbors=1000)

def knnimputerfunc(data):
    data = knnimputer.fit_transform(data)
    return pd.DataFrame(data)

In [63]:
def selectImputer(chooseImputer,data):
    if chooseImputer == 'KNN':
        return knnimputerfunc(data)
    else:
        return imputer(data)

In [66]:
train_df_nonull= selectImputer("RFR",train_df)
test_df_nonull= selectImputer("RFR",test_df)

[IterativeImputer] Completing matrix with shape (10738, 19)
[IterativeImputer] Ending imputation round 1/10, elapsed time 68.26
[IterativeImputer] Change: 6.969800034348991, scaled tolerance: 0.24855275470161067 
[IterativeImputer] Ending imputation round 2/10, elapsed time 137.99
[IterativeImputer] Change: 0.7525356628919946, scaled tolerance: 0.24855275470161067 
[IterativeImputer] Ending imputation round 3/10, elapsed time 207.89
[IterativeImputer] Change: 0.3745177206152519, scaled tolerance: 0.24855275470161067 
[IterativeImputer] Ending imputation round 4/10, elapsed time 278.35
[IterativeImputer] Change: 0.4361496654929164, scaled tolerance: 0.24855275470161067 
[IterativeImputer] Ending imputation round 5/10, elapsed time 357.13
[IterativeImputer] Change: 0.3107685890760816, scaled tolerance: 0.24855275470161067 
[IterativeImputer] Ending imputation round 6/10, elapsed time 430.65
[IterativeImputer] Change: 0.1998603499116154, scaled tolerance: 0.24855275470161067 
[IterativeIm

In [67]:
train_df_nonull.columns=train_df.columns
test_df_nonull.columns=test_df.columns

In [68]:
y = train_df["customer_category"]
X = train_df_nonull.iloc[:,:-1]
X_test = test_df_nonull

In [None]:
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_test.astype(float))

In [69]:
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, X, y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.98696462 0.97951583 0.97765363 0.97392924 0.97951583 0.98324022
 0.97951583 0.97858473 0.97670084 0.97763281]


In [70]:
round(np.mean(score)*100, 2)

97.93

In [71]:
clf.fit(X,y)
predictions = clf.predict(X_test)

In [72]:
output = pd.DataFrame({'customer_id': cust_id, 'customer_category': predictions})
output.to_csv('dataset/my_final_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
