In [40]:
#Header
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.binary import BinaryEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

#Importing the datasets
test_data = pd.read_csv('D:\Jathin\jathin\jathin\ML Project\P1 Data\Consumer_Complaints_test_share.csv')
train_data = pd.read_csv('D:\Jathin\jathin\jathin\ML Project\P1 Data\Consumer_Complaints_train.csv')

# Exploratory Data Analysis

test_shape = test_data.shape
train_shape = train_data.shape
print("Shape of test data = ", test_shape)
print("Shape of train data = ", train_shape)
print()
print()

print("Train Data Information:")
print(train_data.info())
print()
print()

print("The percentage of null data in the train dataset:")
print(train_data.isnull().sum() / train_data.shape[0] * 100)

cat_col = list(train_data.select_dtypes(['object']).columns)
cat_col.remove('Date received')
cat_col.remove('Date sent to company')
cat_col.remove('Consumer disputed?')

for col in cat_col:
    print(col, ': train:', train_data[col].nunique(), ' test:', test_data[col].nunique())

train_copy = train_data.copy()
test_copy = test_data.copy()
train_data = train_data.set_index('Complaint ID')
test_data = test_data.set_index('Complaint ID')

for col in test_data.columns:
    varname = col.replace('-', '_').replace('?', '').replace(" ", '_') + '_isNan'
    train_data[varname] = np.where(pd.isnull(train_data[col]), 1, 0)
    test_data[varname] = np.where(pd.isnull(test_data[col]), 1, 0)

train_data['Date received'] = pd.to_datetime(train_data['Date received'], infer_datetime_format=True)
test_data['Date received'] = pd.to_datetime(test_data['Date received'], infer_datetime_format=True)
train_data['Date sent to company'] = pd.to_datetime(train_data['Date sent to company'], infer_datetime_format=True)
test_data['Date sent to company'] = pd.to_datetime(test_data['Date sent to company'], infer_datetime_format=True)

def fe_date(df, column, arg_list):
    for arg in arg_list:
        df[column + '_' + arg] = df[column].dt.__getattribute__(arg)

def fe_date_diff(df, arg_list):
    for i, arg_pair in enumerate(arg_list):
        df['date_diff_' + str(i)] = (df[arg_pair[1]] - df[arg_pair[0]]).dt.days

fe_date(train_data, 'Date received', ['day', 'month', 'year'])
fe_date(test_data, 'Date received', ['day', 'month', 'year'])
fe_date(train_data, 'Date sent to company', ['day', 'month','year'])
fe_date(test_data, 'Date sent to company', ['day', 'month', 'year'])

fe_date_diff(train_data, [['Date received', 'Date sent to company']])
fe_date_diff(test_data, [['Date received', 'Date sent to company']])

lab_enc = ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company public response', 'Company', 'State', 'Tags',
           'Consumer consent provided?', 'Submitted via', 'Company response to consumer', 'Timely response?']
lab_enc_dict = {}

for col in lab_enc:
    lab_enc_dict[col] = {}
    for x, y in enumerate(train_data[col].unique()):
        lab_enc_dict[col][y] = x

    train_data[col] = train_data[col].map(lab_enc_dict[col])
    test_data[col] = train_data[col].map(lab_enc_dict[col])

train_data['ZIP code'] = pd.to_numeric(train_data['ZIP code'].str.slice(0, 3), errors='coerce')
test_data['ZIP code'] = pd.to_numeric(test_data['ZIP code'].str.slice(0, 3), errors='coerce')


def static_clm_creation(df, column, value):
    df[column] = value


def fe_encoding(df_train, df_test, column, encoding_type):
    if encoding_type == 'ohe':
        ohe = OneHotEncoder(cols=[column])
        df_train = ohe.fit_transform(df_train)
        df_train = df_train.drop(column + '_1', axis=1)

        df_test = ohe.transform(df_test)
        df_test = df_test.drop(column + '_1', axis=1)
        return df_train, df_test

    if encoding_type == 'oe':
        oe = OrdinalEncoder(cols=[column])
        df_train = oe.fit_transform(df_train)

        df_test = oe.transform(df_test)
        return df_train, df_test

    if encoding_type == 'be':
        oe = BinaryEncoder(cols=[column])
        df_train = oe.fit_transform(df_train)

        df_test = oe.transform(df_test)
        return df_train, df_test


train_data_final = train_data.drop(['Date received', 'Consumer complaint narrative', 'Date sent to company'], axis=1)
test_data_final = test_data.drop(['Date received', 'Consumer complaint narrative', 'Date sent to company'], axis=1)




Shape of test data =  (119606, 17)
Shape of train data =  (478421, 18)


Train Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 478421 non-null  object
 1   Product                       478421 non-null  object
 2   Sub-product                   339948 non-null  object
 3   Issue                         478421 non-null  object
 4   Sub-issue                     185796 non-null  object
 5   Consumer complaint narrative  75094 non-null   object
 6   Company public response       90392 non-null   object
 7   Company                       478421 non-null  object
 8   State                         474582 non-null  object
 9   ZIP code                      474573 non-null  object
 10  Tags                          67206 non-null   object
 11  Consumer consent pro

In [41]:
# Combine the train and test datasets
combined_data = pd.concat([train_data_final, test_data_final], axis=0)

# Create a SimpleImputer with the most frequent strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform the combined dataset
combined_data_imputed = pd.DataFrame(imputer.fit_transform(combined_data), columns=combined_data.columns)

# Split the combined dataset back into train and test datasets
train_data_final = combined_data_imputed.iloc[:train_data_final.shape[0]]
test_data_final = combined_data_imputed.iloc[train_data_final.shape[0]:]

# Convert the target variable to numeric
train_data_final['Consumer disputed?'] = train_data_final['Consumer disputed?'].map({'No': 0, 'Yes': 1})

# Separate X and y
y = train_data_final['Consumer disputed?']
X = train_data_final.drop(['Consumer disputed?'], axis=1)

# Splitting into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Model Training
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Cross Validation
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='roc_auc')
# Print the cross-validation scores for each fold
print("Cross-validation scores:", scores)
# Print the mean cross-validation score
print("Mean AUC:", scores.mean())

Cross-validation scores: [0.63108084 0.63017643 0.62956565 0.62848979 0.63405991]
Mean AUC: 0.6306745224009982


In [24]:
# Import libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the categorical columns
categorical_columns = ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company public response', 'Company',
                       'State', 'Tags', 'Consumer consent provided?', 'Submitted via',
                       'Company response to consumer', 'Timely response?']

# Apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
                                 remainder='passthrough')

# Transform the data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Initialize and train XGBoost model
xgboost = XGBClassifier()
xgboost.fit(X_train_encoded, y_train)

# Cross Validation
scores1 = cross_val_score(xgboost, X_train_encoded, y_train, cv=5, scoring='roc_auc')

# Print the cross-validation scores for each fold
print("Cross-validation scores:", scores1)

# Print the mean cross-validation score
print("Mean AUC:", scores1.mean())


Cross-validation scores: [0.64719552 0.65141186 0.64874726 0.65138304 0.6538234 ]
Mean AUC: 0.650512215018072


In [42]:
# Import the necessary libraries
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the categorical columns
categorical_columns = ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company public response', 'Company',
                       'State', 'Tags', 'Consumer consent provided?', 'Submitted via',
                       'Company response to consumer', 'Timely response?']

# Apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
                                 remainder='passthrough')

# Transform the data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Initialize and train the LGBMClassifier
lgbmodel = LGBMClassifier()
lgbmodel.fit(X_train_encoded, y_train)

# Cross Validation
scores2 = cross_val_score(lgbmodel, X_train_encoded, y_train, cv=5, scoring='roc_auc')

# Print the cross-validation scores for each fold
print("Cross-validation scores:", scores2)

# Print the mean cross-validation score
print("Mean AUC:", scores2.mean())


Cross-validation scores: [0.64937369 0.65211447 0.65083043 0.6526629  0.65636906]
Mean AUC: 0.6522701095811125


In [43]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Define the hyperparameter space
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': sp_randint(100, 1000),
    'max_depth': sp_randint(3, 10),
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Initialize the LGBMClassifier
lgbmodel = LGBMClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbmodel,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='roc_auc',
    n_jobs=-1,  # Use all available CPU cores
    cv=5,  # Number of cross-validation folds
    random_state=42
)

# Perform the random search
random_search.fit(X_train_encoded, y_train)

# Print the best parameters found
print("Best parameters found:")
print(random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Perform predictions and evaluation with the best model
train_predictions = best_model.predict_proba(X_train_encoded)[:, 1]
test_predictions = best_model.predict_proba(X_test_encoded)[:, 1]

train_auc = roc_auc_score(y_train, train_predictions)
test_auc = roc_auc_score(y_test, test_predictions)

print("Train AUC:", train_auc)
print("Test AUC:", test_auc)


Best parameters found:
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 513, 'subsample': 0.7}
Train AUC: 0.6908827604592367
Test AUC: 0.6552557660369753


In [34]:
# Preprocess the test data
X_test_encoded = preprocessor.transform(X_test)

# Model Evaluation
train_predictions_xg = xgboost.predict_proba(X_train_encoded)[:, 1]
test_predictions_xg = xgboost.predict_proba(X_test_encoded)[:, 1]

train_auc_xg = roc_auc_score(y_train, train_predictions_xg)
test_auc_xg = roc_auc_score(y_test, test_predictions_xg)

print("Train AUC:", train_auc_xg)
print("Test AUC:", test_auc_xg)


Train AUC: 0.6841794108859887
Test AUC: 0.6522037004069259


In [29]:
# Model Evaluation
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

train_auc = roc_auc_score(y_train, train_predictions)
test_auc = roc_auc_score(y_test, test_predictions)

print("Train AUC:", train_auc)
print("Test AUC:", test_auc)

Train AUC: 0.9987300411721269
Test AUC: 0.530112358167221


In [47]:
# Update feature names in train and test datasets
train_data_final = train_data_final.drop('Consumer disputed?', axis=1, errors='ignore')  # Remove the target variable if it exists
test_data_final = test_data_final[train_data_final.columns]  # Keep only the matching columns

# Preprocess the test data
test_data_final_encoded = preprocessor.transform(test_data_final)

# Predict on the test dataset
prediction = lgbmodel.predict_proba(test_data_final_encoded)[:, 1]

# Create the submission dataframe
submission = pd.DataFrame({'Complaint ID': test_copy['Complaint ID'], 'Consumer disputed?': prediction})
submission['Consumer disputed?'] = np.where(submission['Consumer disputed?'] > 0.5, 'Yes', 'No')

# Export the submission dataframe to a CSV file
submission.to_csv('sample_submission.csv', index=False)
print(submission.shape)


(119606, 2)
