In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)


  train_df = pd.read_csv('/content/train.csv')


In [None]:
# Function to clean data
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Columns that should be numeric are passed through ``pd.to_numeric``
         with ``errors='coerce'``, so any value that cannot be parsed becomes NaN.
      2. A fixed subset of columns has its NaNs replaced by that column's median.
      3. ``Credit_History_Age`` is reduced to the first run of digits found in
         each string (as float), with gaps filled by the median of those numbers.

    Note that ``Age``, ``Annual_Income``, ``Outstanding_Debt`` and
    ``Changed_Credit_Limit`` are coerced but NOT imputed here, so they can
    still contain NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing every column referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the conversions and imputations applied.
    """
    # Work on a copy: mutating the caller's frame made a second call fail,
    # because Credit_History_Age was already float and has no `.str` accessor.
    cleaned = df.copy()

    # Convert columns to appropriate data types
    numeric_columns = [
        'Age',
        'Annual_Income',
        'Outstanding_Debt',
        'Monthly_Balance',
        'Num_of_Delayed_Payment',
        'Changed_Credit_Limit',
        'Amount_invested_monthly',
    ]
    for column in numeric_columns:
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')

    # Handle missing values.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place form acts on a temporary under pandas Copy-on-Write
    # and leaves the frame unchanged.
    median_filled_columns = [
        'Monthly_Inhand_Salary',
        'Num_Credit_Inquiries',
        'Amount_invested_monthly',
        'Num_of_Delayed_Payment',
        'Monthly_Balance',
    ]
    for column in median_filled_columns:
        cleaned[column] = cleaned[column].fillna(cleaned[column].median())

    # Keep only the first number in each Credit_History_Age string; extract once
    # and reuse the result for both the values and their median.
    history_number = (
        cleaned['Credit_History_Age']
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    )
    cleaned['Credit_History_Age'] = history_number.fillna(history_number.median())

    return cleaned

In [None]:
# Run both raw tables through clean_data (training table first, then test)
train_df_cleaned, test_df_cleaned = (
    clean_data(frame) for frame in (train_df, test_df)
)

# Separate the target column from the model inputs; the columns listed here
# are excluded from the feature matrix
non_feature_columns = ['ID', 'Customer_ID', 'Name', 'SSN', 'Credit_Score', 'Month']
y_train = train_df_cleaned['Credit_Score']
X_train = train_df_cleaned.drop(columns=non_feature_columns)

In [None]:
# Partition the object-typed feature columns by cardinality:
# more than 20 distinct values -> "high", otherwise -> "low"
categorical_features = X_train.select_dtypes(include=['object']).columns
high_cardinality_features = []
low_cardinality_features = []
for col in categorical_features:
    if X_train[col].nunique() > 20:
        high_cardinality_features.append(col)
    else:
        low_cardinality_features.append(col)

# Fit one LabelEncoder per high-cardinality column and replace the column
# with its integer codes; the fitted encoders are kept for reuse on test data
label_encoders = {}
for col in high_cardinality_features:
    encoder = LabelEncoder().fit(X_train[col])
    label_encoders[col] = encoder
    X_train[col] = encoder.transform(X_train[col])

In [None]:
# One-hot encode low cardinality features.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so use the current spelling and only fall back to the
# legacy keyword on installations that do not know `sparse_output` yet.
try:
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(drop='first', sparse=False)
X_train_encoded_low_cardinality = one_hot_encoder.fit_transform(X_train[low_cardinality_features])

# Wrap the dense indicator array in a DataFrame named after the generated
# feature names, then swap it in for the original low-cardinality columns.
# Both sides get a fresh 0..n-1 index so the concat lines rows up by position.
X_train_encoded_low_cardinality_df = pd.DataFrame(
    X_train_encoded_low_cardinality,
    columns=one_hot_encoder.get_feature_names_out(low_cardinality_features),
)
X_train = pd.concat(
    [
        X_train.drop(columns=low_cardinality_features).reset_index(drop=True),
        X_train_encoded_low_cardinality_df.reset_index(drop=True),
    ],
    axis=1,
)




In [None]:
# Fill whatever gaps remain with the per-column median
X_train = X_train.fillna(X_train.median())

# Standardise every column and keep the fitted scaler for the test data.
# NOTE(review): the medians above and this scaler are computed on all rows
# before the split below, so validation rows influence them - consider
# fitting on the training split only.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X_train_normalized,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_final, X_val, y_train_final, y_val = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Fit a logistic-regression baseline on the training split
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_final, y_train_final
)

# Class predictions for the validation split
y_val_pred = logistic_model.predict(X_val)

# Score the predictions: plain accuracy, class-weighted precision/recall/F1,
# and one-vs-rest ROC-AUC computed from the predicted class probabilities
accuracy = accuracy_score(y_val, y_val_pred)
precision, recall, f1 = (
    metric(y_val, y_val_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
y_val_proba = logistic_model.predict_proba(X_val)
roc_auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')

# Report the metrics, one per line
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    f'ROC-AUC: {roc_auc}',
    sep='\n',
)


Accuracy: 0.62445
Precision: 0.6260779002595036
Recall: 0.62445
F1-score: 0.619511326002176
ROC-AUC: 0.7886096670714432


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def _score_on_validation(model):
    """Predict on the validation split and compute the reported metrics.

    Returns a tuple ``(predictions, accuracy, precision, recall, f1, roc_auc)``
    where precision/recall/F1 use ``average='weighted'`` and ROC-AUC is
    one-vs-rest over the model's predicted class probabilities.
    """
    predicted = model.predict(X_val)
    acc = accuracy_score(y_val, predicted)
    prec = precision_score(y_val, predicted, average='weighted')
    rec = recall_score(y_val, predicted, average='weighted')
    f_one = f1_score(y_val, predicted, average='weighted')
    auc = roc_auc_score(y_val, model.predict_proba(X_val), multi_class='ovr')
    return predicted, acc, prec, rec, f_one, auc


# Decision tree: fit on the training split, then score on the validation split
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(
    X_train_final, y_train_final
)
(
    y_val_pred_dt,
    accuracy_dt,
    precision_dt,
    recall_dt,
    f1_dt,
    roc_auc_dt,
) = _score_on_validation(decision_tree_model)

# Report the decision tree metrics, one per line
print(
    f'Decision Tree - Accuracy: {accuracy_dt}',
    f'Decision Tree - Precision: {precision_dt}',
    f'Decision Tree - Recall: {recall_dt}',
    f'Decision Tree - F1-score: {f1_dt}',
    f'Decision Tree - ROC-AUC: {roc_auc_dt}',
    sep='\n',
)

# Random forest: same procedure with an ensemble of trees
random_forest_model = RandomForestClassifier(random_state=42).fit(
    X_train_final, y_train_final
)
(
    y_val_pred_rf,
    accuracy_rf,
    precision_rf,
    recall_rf,
    f1_rf,
    roc_auc_rf,
) = _score_on_validation(random_forest_model)

# Report the random forest metrics, one per line
print(
    f'Random Forest - Accuracy: {accuracy_rf}',
    f'Random Forest - Precision: {precision_rf}',
    f'Random Forest - Recall: {recall_rf}',
    f'Random Forest - F1-score: {f1_rf}',
    f'Random Forest - ROC-AUC: {roc_auc_rf}',
    sep='\n',
)


Decision Tree - Accuracy: 0.6974
Decision Tree - Precision: 0.697572397129452
Decision Tree - Recall: 0.6974
Decision Tree - F1-score: 0.6974741273266262
Decision Tree - ROC-AUC: 0.7527277657734691
Random Forest - Accuracy: 0.78065
Random Forest - Precision: 0.7803768541921579
Random Forest - Recall: 0.78065
Random Forest - F1-score: 0.7804956645158008
Random Forest - ROC-AUC: 0.9050128412352693


In [None]:


# Prepare the test data (following the same steps as for the training data)
X_test = test_df_cleaned.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN', 'Month'])

# Label encode high cardinality features in test data.
# The fitted encoders are used read-only: appending an '<unknown>' class to
# `encoder.classes_` changed the encoders on every run of this cell.
for col, encoder in label_encoders.items():
    # Values never seen during fitting share one extra code just past the
    # fitted classes (the same code the appended '<unknown>' class used to get).
    unknown_code = len(encoder.classes_)
    # Series.isin matches NaN against NaN, unlike `s not in encoder.classes_`,
    # so missing values keep the code they were given during training instead
    # of being re-labelled as unknown.
    is_known = X_test[col].isin(encoder.classes_).to_numpy()
    codes = np.full(len(X_test), unknown_code, dtype=np.int64)
    if is_known.any():
        codes[is_known] = encoder.transform(X_test.loc[is_known, col])
    X_test[col] = codes

# One-hot encode low cardinality features in test data
X_test_encoded_low_cardinality = one_hot_encoder.transform(X_test[low_cardinality_features])
X_test_encoded_low_cardinality_df = pd.DataFrame(
    X_test_encoded_low_cardinality,
    columns=one_hot_encoder.get_feature_names_out(low_cardinality_features),
)
X_test = pd.concat(
    [
        X_test.drop(columns=low_cardinality_features).reset_index(drop=True),
        X_test_encoded_low_cardinality_df.reset_index(drop=True),
    ],
    axis=1,
)

# Ensure no missing values are left. Impute with the TRAINING medians so the
# test rows are filled with the same values the model was fitted against
# (filling X_train with its own medians earlier did not change those medians).
X_test = X_test.fillna(X_train.median())

# Normalize numerical features in test data with the scaler fitted on training data
X_test_normalized = scaler.transform(X_test)

# Use the best model (e.g., Random Forest) to predict on test data
y_test_pred = random_forest_model.predict(X_test_normalized)

# Output the predictions
predictions = pd.DataFrame({'ID': test_df['ID'], 'Predicted_Credit_Score': y_test_pred})
predictions.to_csv('predicted_credit_scores.csv', index=False)
print("Predictions saved to 'predicted_credit_scores.csv'")


Predictions saved to 'predicted_credit_scores.csv'
