In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the path of every file under the Kaggle input directory, then list them.
input_paths = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [38]:
# Load the competition's train and test tables, using the `id` column as the index.
train_csv = '../input/tabular-playground-series-mar-2021/train.csv'
test_csv = '../input/tabular-playground-series-mar-2021/test.csv'

df = pd.read_csv(train_csv, index_col='id')
testdf = pd.read_csv(test_csv, index_col='id')

In [39]:
# Preview the first five rows of the training table.
df.head(5)

In [40]:
# Separate the label column (y) from the remaining feature columns (X).
y = df['target']
X = df.drop(columns=['target'])


In [41]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the rows for validation; random_state pins the shuffle so the
# split is repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Walk the training columns once and sort them into two groups:
#   - object-dtype columns with fewer than 10 distinct values in the training split
#   - int64 / float64 columns
low_cardinality_cols = []
numerical_cols = []
for col_name in X_train_full.columns:
    col_dtype = X_train_full[col_name].dtype
    if col_dtype == "object" and X_train_full[col_name].nunique() < 10:
        low_cardinality_cols.append(col_name)
    if col_dtype in ['int64', 'float64']:
        numerical_cols.append(col_name)

# Restrict the train, validation and test frames to the selected columns
# (categorical columns first, then numerical ones).
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
testdf = testdf[my_cols]


In [42]:
# Collect the names of the object-dtype (categorical) columns of X_train.
is_object = (X_train.dtypes == 'object')
object_cols = [col for col, flag in is_object.items() if flag]

In [43]:
# Show which columns will be treated as categorical.
print("Categorical variables:", object_cols, sep="\n")

In [44]:
# Preview the first five rows of the column-filtered test table.
testdf.head(5)

## Ordinal Encoding

In [45]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each categorical column with integer codes. The encoder learns its
# category-to-code mapping from the training split and reuses that same mapping
# for the validation split.
ordinalEncoder = OrdinalEncoder()

label_X_train = X_train.copy()
label_X_train[object_cols] = ordinalEncoder.fit_transform(X_train[object_cols])

label_X_valid = X_valid.copy()
label_X_valid[object_cols] = ordinalEncoder.transform(X_valid[object_cols])



In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Logistic regression trained on the ordinal-encoded features.
clf = LogisticRegression(max_iter=1000, verbose=1)
clf.fit(label_X_train, y_train)

# Mean accuracy on each split.
print("Training score")
print(clf.score(label_X_train, y_train))
print("Validation score")
print(clf.score(label_X_valid, y_valid))

# Predictions shared by the confusion matrices and F1 scores below.
y_predT = clf.predict(label_X_train)
y_predV = clf.predict(label_X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix comes out transposed and the 'weighted' F1 is
# weighted by predicted-class counts instead of the true class support.
# The matrices are printed explicitly: a bare expression in the middle of a
# cell is evaluated and then silently discarded.
print("Training set Confusion Matrix")
print(confusion_matrix(y_train, y_predT))
print("Val set Confusion Matrix")
print(confusion_matrix(y_valid, y_predV))

print("F1 score")
print("Training F1 Score")
print(f1_score(y_train, y_predT, average='weighted'))
print("Validation F1 score")
print(f1_score(y_valid, y_predV, average='weighted'))


## One Hot Encoding


In [47]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; handle_unknown='ignore' makes categories that were not seen during
# fitting encode as all-zero rows instead of raising.
# The encoder is left at its default sparse output and densified with
# .toarray(): the keyword that requests dense output was renamed between
# scikit-learn releases (`sparse` -> `sparse_output`), so not passing it keeps
# this cell runnable on either side of that rename.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(X_train[object_cols])

# One string label per encoded column, "<source column>_<category>", listed in
# the encoder's output order. String labels matter: scikit-learn only accepts
# feature names when every column label is a string, and default integer labels
# would mix with the string labels of the numerical columns after the concat.
OH_feature_names = [
    "{}_{}".format(col, category)
    for col, categories in zip(object_cols, OH_encoder.categories_)
    for category in categories
]


def _one_hot_frame(frame):
    """Return the one-hot encoding of ``frame[object_cols]`` as a dense DataFrame.

    The encoder output carries no index, so the result is given ``frame``'s
    index and ``OH_feature_names`` as column labels.
    """
    encoded = OH_encoder.transform(frame[object_cols]).toarray()
    return pd.DataFrame(encoded, columns=OH_feature_names, index=frame.index)


OH_cols_train = _one_hot_frame(X_train)
OH_cols_valid = _one_hot_frame(X_valid)
OH_cols_test = _one_hot_frame(testdf)

# Remove categorical columns (they are replaced by the one-hot columns)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = testdf.drop(object_cols, axis=1)

# Add one-hot encoded columns to the numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)


In [58]:
# Report the (rows, columns) shape of each one-hot encoded frame, then preview
# the encoded test frame.
for frame in (OH_X_train, OH_X_valid, OH_X_test):
    print(frame.shape)
print(OH_X_test.head())

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Logistic regression trained on the one-hot encoded features.
clf2 = LogisticRegression(max_iter=900, verbose=1).fit(OH_X_train, y_train)

# Mean accuracy on each split.
print("Training score")
print(clf2.score(OH_X_train, y_train))
print("Validation score")
print(clf2.score(OH_X_valid, y_valid))

# Predictions shared by the confusion matrix and F1 scores below.
y_predT = clf2.predict(OH_X_train)
y_predV = clf2.predict(OH_X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix comes out transposed and the 'weighted' F1 is
# weighted by predicted-class counts instead of the true class support.
print("Val set Confusion Matrix")
print(confusion_matrix(y_valid, y_predV))

print("F1 score")
print("Training F1 Score")
print(f1_score(y_train, y_predT, average='weighted'))
print("Validation F1 score")
print(f1_score(y_valid, y_predV, average='weighted'))


In [51]:
# Preview the first five rows of the one-hot encoded test frame.
OH_X_test.head(5)

In [65]:
# Show the test-set index (the `id` column chosen as the index when the CSV was read).
testdf.index

In [52]:
# NOTE(review): OH_X_test appears to contain only numeric columns at this point
# (the categorical columns were one-hot encoded earlier), so get_dummies
# presumably has nothing left to encode and X_test is effectively a copy of
# OH_X_test — confirm before relying on this step.
X_test = pd.get_dummies(OH_X_test)

In [62]:
# Predict a class label for every test row with the one-hot logistic regression.
# Despite its name, X_test_prep holds the model's predictions, not prepared features.
# NOTE(review): these are hard class labels; if the competition is scored on a
# ranking metric such as AUC, predict_proba would be the usual choice — confirm.
X_test_prep = clf2.predict(X_test)
print(X_test_prep.shape)

In [66]:
# Assemble the submission table: test ids alongside the predicted targets.
submission = pd.DataFrame(dict(id=testdf.index, target=X_test_prep))

In [68]:
# Write the submission to a CSV file in the current directory; index=False
# leaves the DataFrame's row index out of the file.
submission.to_csv('Submission_log_reg3.csv', index = False)