In [13]:
# First, we need to install the libraries that we need

# Notes on the libraries
# Numpy is Python's array data structure
# Pandas are Python's main data structure, they are like excel spreadsheet
# Openpyxl is the file that allows Python to open and read excel files
# Statsmodels contains Python's built-in machine learning models
# Scikit-learn is a machine learning and AI library
# Seaborn and matplot lib are used to create graphs
# We use !pip to install libraries
# The ! is the equivalent of a terminal command
!pip install pandas
!pip install numpy
!pip install openpyxl
!pip install statsmodels
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn



In [14]:
# Import the libraries
# Logistic regression requires numeric inputs
# We will use encoding to convert from
# a category (e.g., Gender) to a number
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# Load the car-sales workbook and print a quick overview of it
# NOTE(review): '/Downloads/' is an absolute path from the filesystem root -
# confirm that this is where the workbook actually lives
filepath = '/Downloads/'
filename = 'car_sales.xlsx'
df = pd.read_excel(filepath + filename, sheet_name = 'sheet1')

# Each entry pairs a heading with the summary printed underneath it
overview = [
    ("Size of the dataset", df.shape),
    ("\n" + "The first 5 records", df.head(5)),
    ("\n" + "Columns and their datatypes", df.dtypes),
]
for heading, summary in overview:
    print(heading, summary, sep="\n")

NameError: name 'pd' is not defined

In [16]:
# Some basic feature engineering
# Let's see the % who financed to determine
# whether the dataset is unbalanced
# The mean of a boolean mask is the fraction of True values
percent_financed = (df['Financing Indicator'] == 'Financing').mean() * 100
formatted_percent = f"{percent_financed:.2f}%"
print("The percentage of customers who financed their purchase is  " + formatted_percent)
print("")

# Nulls are an issue - let's see which columns have them
null_counts = df.isnull().sum()
for column, null_count in null_counts.items():
    if null_count > 0:
        percent_null = null_count / len(df) * 100
        # Show the raw count as well: a handful of nulls in a large dataset
        # would otherwise be reported as a misleading "0.00%"
        print(f"The column {column} has {null_count} null values ({percent_null:.2f}%)")
    else:
        print(f"The column {column} does not have any null values")


The percentage of customers who financed their purchase is  36.68%

The column Car_id does not have any null values
The column Date does not have any null values
The column Purchase_Month does not have any null values
The column Customer Name has 0.00% null values
The column Gender does not have any null values
The column Annual Income does not have any null values
The column Income_Proxy does not have any null values
The column Financing Indicator does not have any null values
The column Dealer_Name does not have any null values
The column Company does not have any null values
The column Model does not have any null values
The column Engine does not have any null values
The column Transmission does not have any null values
The column Color does not have any null values
The column Price ($) does not have any null values
The column Price_Proxy does not have any null values
The column Dealer_No does not have any null values
The column Body Style does not have any null values
The column P

In [17]:
# Let's prepare the data for the model
# Drop Customer Name - it does not have predictive value
df_model = df.drop(['Customer Name'], axis=1)
print(df_model['Dealer_Region'].unique())
print(df_model['Financing Indicator'].unique())

# Clean the data by filtering out invalid values (e.g., 'January')
# .copy() makes df_model an independent frame, so the column assignments
# below cannot raise SettingWithCopyWarning when rows are filtered out
df_model = df_model[~df_model['Dealer_Region'].isin(['January'])].copy()
df_model['Dealer_Region'] = df_model['Dealer_Region'].fillna('Unknown').astype(str)
df_model['Financing Indicator'] = df_model['Financing Indicator'].fillna('Unknown').astype(str)

# Encode categorical variables as integer codes
# One encoder per column, so each keeps its own classes_ and the codes
# can still be mapped back to the original labels later
# NOTE(review): the codes follow the sorted label order and carry no
# numeric meaning for nominal columns such as Model and Dealer_Region
label_encoders = {}
for col_name in ['Model', 'Dealer_Region', 'Financing Indicator']:
    label_encoders[col_name] = LabelEncoder()
    df_model[col_name] = label_encoders[col_name].fit_transform(df_model[col_name])
# Keep the original name bound to the encoder fitted last, as before
label_encoder = label_encoders['Financing Indicator']

# Calculate and print the correlation matrix
print(df_model.corr(numeric_only=True))

['Middletown' 'Aurora' 'Greenville' 'Pasco' 'Janesville' 'Scottsdale'
 'Austin']
['Financing' 'Cash']
                     Annual Income  Financing Indicator     Model  Price ($)  \
Annual Income             1.000000            -0.568909 -0.006852   0.012065   
Financing Indicator      -0.568909             1.000000  0.027851   0.296656   
Model                    -0.006852             0.027851  1.000000   0.052608   
Price ($)                 0.012065             0.296656  0.052608   1.000000   
Phone                    -0.000351             0.000201 -0.001737  -0.000493   
Dealer_Region            -0.006084            -0.001262 -0.005928  -0.009400   

                        Phone  Dealer_Region  
Annual Income       -0.000351      -0.006084  
Financing Indicator  0.000201      -0.001262  
Model               -0.001737      -0.005928  
Price ($)           -0.000493      -0.009400  
Phone                1.000000      -0.008127  
Dealer_Region       -0.008127       1.000000  


In [18]:
# Logistic regression of the financing indicator on income, price,
# dealer region and car model
feature_columns = ['Annual Income', 'Price ($)', 'Dealer_Region', 'Model']
X = sm.add_constant(df_model[feature_columns])
y = df_model['Financing Indicator']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72)

# Fit on the training rows only and print the regression summary
model_1 = sm.Logit(y_train, X_train).fit()
print(model_1.summary())

# Turn predicted probabilities for the test rows into 0/1 labels
# using a 0.5 threshold
test_probabilities = model_1.predict(X_test)
y_pred = (test_probabilities > 0.5).astype(int)

# Accuracy on the test rows, expressed as a percentage
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"The model has an accuracy of {accuracy:.2f}%.")

# Score every row (training and test alike) with the same 0.5 threshold
all_probabilities = model_1.predict(X)
df_model['Predicted_Financing'] = (all_probabilities > 0.5).astype(int)

# Write the car id, the model inputs and the prediction to a CSV file
export_columns = ['Car_id'] + feature_columns + ['Predicted_Financing']
df_model[export_columns].to_csv('car_sales_predictions3.csv', index=False)
print("Predictions saved to car_sales_predictions3.csv")

Optimization terminated successfully.
         Current function value: 0.278472
         Iterations 8
                            Logit Regression Results                           
Dep. Variable:     Financing Indicator   No. Observations:                19124
Model:                           Logit   Df Residuals:                    19119
Method:                            MLE   Df Model:                            4
Date:                 Sun, 27 Oct 2024   Pseudo R-squ.:                  0.5766
Time:                         18:03:51   Log-Likelihood:                -5325.5
converged:                        True   LL-Null:                       -12577.
Covariance Type:             nonrobust   LLR p-value:                     0.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.0372      0.080     -0.463      0.643      -0.195       0.120
Annual Incom