In [104]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model, decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score


%matplotlib inline
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
dft = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
dfs = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
df.shape

In [None]:
dft.shape

In [None]:
df.columns

In [None]:
dft.columns

In [None]:
df.head(3)

In [None]:
df.info()

In [3]:
X=df.loc[:,'feature_0':'feature_49']

In [4]:
y=df.loc[:,'target']

In [5]:
y=np.ravel(y)

In [None]:
X.describe()

###### Train

In [None]:
for col in df.columns:
    print(df[col].value_counts())

###### Test

In [None]:
for col in dft.columns:
    print(dft[col].value_counts())

In [None]:
cor_matrix=df.corr(method="pearson")
cor_matrix

In [None]:
for i in X.columns:
    sns.histplot(X[i])
    plt.show()

In [6]:
label=LabelEncoder()
y=label.fit_transform(y)

In [7]:
y

array([1, 0, 0, ..., 2, 1, 2])

# TRAIN AND TEST SPLIT

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 50)

In [11]:
cv=StratifiedKFold(n_splits=5)

# CREATING TWO PIPELINES FOR LOGISTIC AND RANDOM FOREST

In [81]:
estimator_1=[( 'Scalar1',StandardScaler()),('pcal1',PCA(n_components=30)),
             ('cls1', RandomForestClassifier())] #created the input for pipelines

pipe_1= Pipeline(estimator_1) #first pipe line created


In [None]:
pipe_1.fit(X_train, y_train)
p1_cv_scores = cross_val_score(pipe_1, X_train, y_train, cv=cv)

print('Pipe_1_train_Score,{},testScore,{}'.format(pipe_1.score(X_train, y_train),pipe_1.score(X_test, y_test)))
print('Pipe_1_cvmean,{},pipe_1_cvstd,{}'.format(np.mean(p1_cv_scores), np.std(p1_cv_scores)))

In [113]:
pca=PCA()
logistic = LogisticRegression(max_iter=10000, tol=0.1)

logistic = LogisticRegression(max_iter=10000, tol=0.1)
pipe_2 = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

In [114]:
pipe_2.fit(X_train, y_train)
p2_cv_scores = cross_val_score(pipe_2, X_train, y_train, cv=cv)

print('Pipe_2_train_Score,{},testScore,{}'.format(pipe_2.score(X_train, y_train),pipe_2.score(X_test, y_test)))
print('Pipe_2_cvmean,{},pipe_2_cvstd,{}'.format(np.mean(p2_cv_scores), np.std(p2_cv_scores)))

Pipe_2_train_Score,0.574025,testScore,0.57935
Pipe_2_cvmean,0.5739625,pipe_2_cvstd,0.0002999999999999799


# GRID SEARCH _PARAMETER OPTIMIZATION

In [115]:
param_grid = {
    'pca__n_components': [5, 15, 30, 45, 64],
    'logistic__C': np.logspace(-4, 4, 4),
}
search = GridSearchCV(pipe_2, param_grid, n_jobs=-1)
search.fit(X_train, y_train)

 0.573925  0.5741625       nan 0.574     0.5739875 0.5739375 0.5741625
       nan 0.574     0.5739875 0.57395   0.5741625       nan]


GridSearchCV(estimator=Pipeline(steps=[('pca', PCA()),
                                       ('logistic',
                                        LogisticRegression(max_iter=10000,
                                                           tol=0.1))]),
             n_jobs=-1,
             param_grid={'logistic__C': array([1.00000000e-04, 4.64158883e-02, 2.15443469e+01, 1.00000000e+04]),
                         'pca__n_components': [5, 15, 30, 45, 64]})

In [116]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.574):
{'logistic__C': 0.046415888336127774, 'pca__n_components': 45}


# PREDICTION

In [117]:
X_pred=dft.loc[:,'feature_0':'feature_49']

In [118]:
predict = pipe_2.predict(X_pred)
print(predict)

[1 1 1 ... 1 1 1]


In [119]:
probability_predict = pipe_2.predict_proba(X_pred)
probability_predict

array([[0.09817622, 0.61242713, 0.17005713, 0.11933951],
       [0.09897637, 0.56029506, 0.20524468, 0.13548389],
       [0.08144702, 0.6343958 , 0.16868057, 0.11547661],
       ...,
       [0.08385618, 0.55420908, 0.21069443, 0.15124031],
       [0.0791596 , 0.5974639 , 0.16487185, 0.15850465],
       [0.08486735, 0.61274643, 0.16535291, 0.13703331]])

In [120]:
dfs.iloc[:,1:5] = probability_predict
dfs

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.098176,0.612427,0.170057,0.119340
1,100001,0.098976,0.560295,0.205245,0.135484
2,100002,0.081447,0.634396,0.168681,0.115477
3,100003,0.088859,0.516482,0.297382,0.097277
4,100004,0.071755,0.587836,0.195050,0.145359
...,...,...,...,...,...
49995,149995,0.092162,0.714239,0.123475,0.070124
49996,149996,0.084123,0.608014,0.154853,0.153010
49997,149997,0.083856,0.554209,0.210694,0.151240
49998,149998,0.079160,0.597464,0.164872,0.158505


In [121]:
dfs.to_csv('submission_log_2.csv',index=False)

In [None]:
dfs.to_csv('tps_submission_log.csv',index=False)