In [92]:
import pandas as pd
# Load the raw credit-score table.
# NOTE(review): absolute, machine-specific Windows path; the script only runs
# where this exact file exists.
df = pd.read_csv(r"C:\Users\user\Desktop\Spreadsheets\credit_score.csv")

def config_data(data):
    """
    Normalise the column labels of ``data``.

    Every label has its surrounding whitespace removed and is converted to
    lower case. The frame is modified in place and the same object is
    returned, so the call can be used either for its side effect or in an
    assignment.
    """
    # One chained pass over the labels: trim first, then lower-case.
    data.columns = data.columns.str.strip().str.lower()
    return data
# Normalise the column labels before any column is referenced by name below.
df = config_data(df)



def inspect(data):
    """
    Print an exploratory summary of ``data``.

    The report covers, in order: the column labels, the first five rows,
    the shape, the ``DataFrame.info()`` structure/dtype summary, the number
    of missing values per column, the number of duplicated rows and the
    number of unique values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("checking the columns in the dataset")
    print()
    print(data.columns)

    print()
    print('The first five(5) rows in the dataset')
    print(data.head(5))

    print()
    print("The Number of rows and columns in the dataset")
    print(data.shape)
    print()

    print('The structure and datatype of the dataset')

    print()
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    data.info()
    print()

    # Keep only the columns that have at least one missing value. A single
    # NaN already counts, so the threshold is > 0.
    missing_counts = data.isna().sum()
    missing_counts = missing_counts[missing_counts > 0]
    if missing_counts.empty:
        print('There is no missing value detected')
    else:
        for column, count in missing_counts.items():
            print(f"There are {count} missing values detected in '{column}'")

    print()
    print(f"There are {data.duplicated().sum()} duplicated rows in the dataset")

    print()
    print("The number of unique values in the variables")
    print(data.nunique())
# Print the exploratory summary for the frame with normalised column labels.
inspect(df)

# Blank separator line after the summary output.
print()


def split(data):
    """
    Separate ``data`` into a feature frame and the target series.

    The features are every column except 'credit_score' (the target),
    'cust_id' and 'cat_gambling'. The target is the 'credit_score' column.
    The input frame is left untouched.

    Returns a ``(features, target)`` pair.
    """
    excluded = ['credit_score', 'cust_id', 'cat_gambling']
    features = data.drop(columns=excluded)
    target = data['credit_score']
    return features, target

x, y = split(df)
# Column groups consumed by the preprocessing transformer built in process().
# Each must be a plain Index of labels: a trailing comma after `.columns`
# would wrap the Index in a 1-tuple, which is not a valid column selector.
num_col = x.select_dtypes(include=['int', 'float']).columns
cat_col = x.select_dtypes(include=['object']).columns

    
def train(x,y):
    """
    Partition the features ``x`` and target ``y`` into train and test sets.

    A quarter of the rows is held out for testing; the random seed is fixed
    at 42 so the partition is the same on every run.

    Returns ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    partitions = train_test_split(x, y, test_size=0.25, random_state=42)
    x_train, x_test, y_train, y_test = partitions
    return x_train, x_test, y_train, y_test

# Hold-out split. x_test and y_test are also read as module-level names
# inside models() further down.
x_train,x_test,y_train,y_test = train(x,y)
    


def process(df):
    """
    Build the (unfitted) preprocessing transformer for modeling.

    Columns listed in the module-level ``num_col`` are imputed with the
    column mean and then standardised. Columns listed in the module-level
    ``cat_col`` are imputed with their most frequent value and then
    ordinal-encoded, with categories unseen during fitting encoded as -1.
    Columns in neither group are passed through unchanged.

    Parameters
    ----------
    df : object
        Not used by the function; kept so existing calls such as
        ``process(df)`` keep working.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The configured transformer. It still has to be fitted by the caller.
    """
    from sklearn.preprocessing import StandardScaler, OrdinalEncoder
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.compose import ColumnTransformer

    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    cat_pipe = Pipeline([
        # The strategy name is spelled with an underscore; 'most frequent'
        # is rejected by SimpleImputer as an invalid parameter.
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # The step is named 'scaler' but holds the encoder; the name is kept
        # so lookups by step name keep working.
        ('scaler', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    transformer = ColumnTransformer(
        transformers=[
            ('num_pipe', num_pipe, num_col),
            ('cat_pipe', cat_pipe, cat_col)
        ],
        remainder='passthrough'
    )
    return transformer
    
# Build the preprocessing transformer (not fitted here).
# NOTE(review): `processor` is not referenced again in the visible code, so
# the models below are trained on the untransformed features.
processor = process(df)


def models(a, b, x_eval=None, y_eval=None, preprocessor=None):
    """
    Train several regressors to predict the credit score and print their errors.

    Each estimator is fitted on ``(a, b)`` and scored on the evaluation set
    with mean absolute error (MAE) and mean absolute percentage error (MAPE).

    Parameters
    ----------
    a : array-like
        Training features.
    b : array-like
        Training target.
    x_eval, y_eval : array-like, optional
        Evaluation features and target. They must be given together. When
        both are omitted the module-level ``x_test`` and ``y_test`` are
        used, which is what the function always did before these
        parameters existed.
    preprocessor : scikit-learn transformer, optional
        When given, a fresh clone of it is placed in front of every
        estimator, so it is fitted on the training data only. When omitted
        the estimators see the features exactly as passed in.

    Raises
    ------
    ValueError
        If only one of ``x_eval`` / ``y_eval`` is supplied.

    Returns
    -------
    None
        The metrics are written to standard output.
    """
    from sklearn.base import clone
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.pipeline import make_pipeline
    from xgboost import XGBRegressor
    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

    if (x_eval is None) != (y_eval is None):
        raise ValueError("x_eval and y_eval must be supplied together")
    if x_eval is None:
        # Backward-compatible default: fall back to the module-level split.
        x_eval, y_eval = x_test, y_test

    # Named `estimators` so the dict does not shadow this function's own name.
    estimators = {
        'LR': LinearRegression(),
        'RF': RandomForestRegressor(),
        'TREE': DecisionTreeRegressor(),
        'KN': KNeighborsRegressor(),
        'xgb': XGBRegressor()
    }

    print("MODEL/ALGORITHM PERFOMANCE EVALUATION USING MEAN ABSOLUTE ERROR(MAE) AND MEAN ABSOLUTE PERCENTAGE ERROR(MAPE) METRICS:")
    print()
    for name, model in estimators.items():
        if preprocessor is not None:
            # Clone so every estimator gets its own, independently fitted
            # copy of the transformer.
            model = make_pipeline(clone(preprocessor), model)
        prediction = model.fit(a, b).predict(x_eval)

        print(f"The MAE of the {name} is {mean_absolute_error(y_eval, prediction)}")
        print(f"The MAPE of the {name} is {mean_absolute_percentage_error(y_eval, prediction)}")
        
# Fit every candidate regressor on the training split and print its MAE/MAPE.
models(x_train, y_train)       

checking the columns in the dataset

Index(['cust_id', 'income', 'savings', 'debt', 'r_savings_income',
       'r_debt_income', 'r_debt_savings', 't_clothing_12', 't_clothing_6',
       'r_clothing', 'r_clothing_income', 'r_clothing_savings',
       'r_clothing_debt', 't_education_12', 't_education_6', 'r_education',
       'r_education_income', 'r_education_savings', 'r_education_debt',
       't_entertainment_12', 't_entertainment_6', 'r_entertainment',
       'r_entertainment_income', 'r_entertainment_savings',
       'r_entertainment_debt', 't_fines_12', 't_fines_6', 'r_fines',
       'r_fines_income', 'r_fines_savings', 'r_fines_debt', 't_gambling_12',
       't_gambling_6', 'r_gambling', 'r_gambling_income', 'r_gambling_savings',
       'r_gambling_debt', 't_groceries_12', 't_groceries_6', 'r_groceries',
       'r_groceries_income', 'r_groceries_savings', 'r_groceries_debt',
       't_health_12', 't_health_6', 'r_health', 'r_health_income',
       'r_health_savings', 'r_health_de

# Project Report: Credit Score Prediction

## Introduction

This project aims to predict credit scores based on various financial and personal attributes of individuals. The dataset used for this project contains information on income, savings, debt, and other financial behaviors, along with categorical variables such as gambling habits, debt status, and credit card usage. The target variable is the credit score, which is a numerical value indicating an individual's creditworthiness.

## Data Preparation

The dataset was initially loaded from a CSV file located at `C:\Users\user\Desktop\Spreadsheets\credit_score.csv`. The data was then processed to ensure consistency and readiness for analysis. The following steps were taken:

1. **Column Renaming**: All column names were stripped of leading and trailing spaces and converted to lowercase to ensure uniformity and ease of reference.
2. **Data Inspection**: The dataset was inspected to understand its structure, including the columns, the first five rows, the number of rows and columns, and the data types of each column.
3. **Missing Value Check**: The dataset was checked for missing values. No missing values were detected across any of the columns.
4. **Duplicate Rows Check**: The dataset was checked for duplicate rows. No duplicate rows were found.
5. **Unique Values Count**: The number of unique values in each variable was counted to understand the diversity of the data.

## Data Splitting

The dataset was split into features (X) and the target variable (credit score, Y). The features included all columns except for 'credit_score', 'cust_id', and 'cat_gambling'. The target variable was the 'credit_score' column.

## Data Preprocessing

A preprocessing pipeline was defined with imputation followed by scaling or encoding, depending on the variable type. Numerical variables are imputed with the mean and scaled using the StandardScaler, while categorical variables are imputed with the most frequent category and encoded using the OrdinalEncoder. Note that in the code above this transformer is constructed but never fitted or passed to the models, so the results reported below come from models trained on the unprocessed features.

## Model Training

The dataset was split into training and testing sets, with 75% of the data used for training and 25% for testing. The following models were evaluated for their performance in predicting credit scores:

- Linear Regression (LR)
- Random Forest Regressor (RF)
- Decision Tree Regressor (TREE)
- K-Nearest Neighbors Regressor (KN)
- XGBoost Regressor (xgb)

Each model was trained on the training set and evaluated on the testing set using Mean Absolute Error (MAE) and Mean Absolute Percentage Error (MAPE) as performance metrics.

## Results

The performance of the models was evaluated based on the MAE and MAPE metrics. The results are as follows:

- **Linear Regression (LR)**: MAE = 21.92, MAPE = 0.0381
- **Random Forest Regressor (RF)**: MAE = 21.72, MAPE = 0.0376
- **Decision Tree Regressor (TREE)**: MAE = 31.77, MAPE = 0.0552
- **K-Nearest Neighbors Regressor (KN)**: MAE = 34.77, MAPE = 0.0626
- **XGBoost Regressor (xgb)**: MAE = 23.35, MAPE = 0.0404

## Conclusion

The project successfully demonstrated the process of predicting credit scores using various machine learning models. The Random Forest Regressor showed the best performance in terms of both MAE (21.72) and MAPE (0.0376), closely followed by Linear Regression (MAE 21.92, MAPE 0.0381); the XGBoost Regressor ranked third (MAE 23.35, MAPE 0.0404), ahead of the Decision Tree and K-Nearest Neighbors regressors. The best model could be further refined and validated using additional data or through cross-validation techniques to improve its predictive accuracy.