# Imports

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
# Pipelines
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

## Parameter Tuning
from sklearn.model_selection import GridSearchCV

## Metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [3]:
from data_gadgets.cleaning import Cleaner

# Reading Data

In [4]:
# Load the raw task-4 data set.
# The previous encoding name 'ansi' is only registered as a codec alias on
# Windows (for 'mbcs'); on Linux/macOS it raises LookupError. 'cp1252' is the
# usual Western "ANSI" code page and is available on every platform.
# NOTE(review): assumes the CSV was saved under a Western-European Windows
# locale - confirm if the file contains other scripts.
path = os.path.join('..', '..', 'data', 'raw', 'data_task4.csv')
data = pd.read_csv(path, encoding='cp1252')

In [5]:
# Preview the first rows to sanity-check that the file was parsed as expected.
data.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [6]:
# Name of the column to predict (as spelled in the raw CSV header).
target = 'car purchase amount'

In [7]:
# Separate the target series from the feature frame.
y = data[target]
X = data.drop(columns=target)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Inspect the held-out feature frame (raw, i.e. not passed through any cleaning here).
X_test

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth
90,Kitra Kerr,scelerisque@Vivamusnon.co.uk,Palau,1,49.956014,60536.20406,8244.470226,173079.1798
254,Rooney R. Padilla,eu.turpis@accumsansed.co.uk,Philippines,0,45.260364,68499.69447,15436.799680,308445.8598
283,Quinlan U. Sears,magna@velarcueu.ca,Canada,1,40.875375,59060.08664,5841.612044,136346.3069
445,Urielle,est.congue@Nunc.edu,Canada,0,52.610045,76318.87883,6392.211408,245216.1691
461,Hillary,ut.mi.Duis@quisarcu.com,South Sudan,1,38.545834,58632.58875,12035.370790,516817.3173
...,...,...,...,...,...,...,...,...
372,"Houston, Grant O.",ut.aliquam@egetlacusMauris.ca,Bermuda,1,34.728221,72948.11812,12664.320520,360457.0496
56,Marny Vargas,nonummy.Fusce.fermentum@ligula.org,Egypt,1,33.816298,84467.78988,7772.444847,468238.7915
440,Ora,bibendum.Donec.felis@liberoestcongue.org,Uruguay,1,39.665657,57777.15558,19692.912620,601210.2803
60,Rowan Kidd,sapien.Aenean.massa@adipiscing.ca,Paraguay,1,42.058089,46689.41590,7829.565502,615765.9289


In [10]:
# X_train.to_csv('../../data/raw/X_train_task4.csv', index=False)
# y_train.to_csv('../../data/raw/y_train_task4.csv', index=False)
# X_test.to_csv('../../data/raw/X_test_task4.csv', index=False)
# y_test.to_csv('../../data/raw/y_test_task4.csv', index=False)

# Cleaning Pipeline

In [11]:
def cleaning_pipeline(df, drop_columns=('customer_name', 'customer_e-mail', 'country')):
    """Apply the ``Cleaner`` steps to a feature frame and drop unused columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to clean.
    drop_columns : iterable of str, optional
        Columns removed after the ``Cleaner`` steps have run. The names must
        match the headers as they exist *after* ``Cleaner.headers``. Defaults
        to the customer name, e-mail and country columns of this data set.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame without ``drop_columns``.

    Raises
    ------
    KeyError
        If any entry of ``drop_columns`` is not present in the cleaned frame
        (pandas' default behaviour for ``drop``).
    """
    cleaner = Cleaner()
    # NOTE(review): presumably normalises the header names (the drop list below
    # uses underscore-style names) - confirm against data_gadgets.
    df = cleaner.headers(df)
    # NOTE(review): semantics of `categories` are not visible here - confirm.
    df = cleaner.categories(df)

    return df.drop(columns=list(drop_columns))

In [12]:
# Clean the training features. X_train is rebound to the cleaned frame, and the
# later fit/score cells rely on that rebinding.
# NOTE: this cell is not idempotent - running it a second time will most likely
# fail inside cleaning_pipeline, because the columns it drops are already gone.
X_train = cleaning_pipeline(X_train)
# Group the remaining columns by kind; the resulting dict is used below to
# select the columns for each preprocessing pipeline.
# NOTE(review): the second argument (None) looks like the target column name - confirm.
cols = Cleaner().separate_data(X_train, None)
cols

{'target': [None],
 'time': [],
 'category': [],
 'category+': [],
 'continuous': [],
 'continuous+': ['age', 'annual_salary', 'credit_card_debt', 'net_worth'],
 'discrete': ['gender']}

Weird grouping. Maybe this happens because there are very few values.

# Components

In [13]:
# steps = [
#     ("imputer", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='N')),
#     ("scaler", FunctionTransformer(impute_first_letter, feature_names_out='one-to-one')),
#     ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
# ]

# impute_cabin = Pipeline(steps)

In [14]:
# Time variables pipeline
steps = [
]
time_pipe = Pipeline(steps)

# Continuous variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
]
continuous_pipe = Pipeline(steps)

# Discrete variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent', ))
]
discrete_pipe = Pipeline(steps)

# Category variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
]
category_pipe = Pipeline(steps)

In [15]:
# Route each column group to its pipeline. The time and categorical groups are
# commented out; any column not listed here is dropped.
transformers = [
    # ("time", time_pipe, cols['time']),
    ("continuous", continuous_pipe, cols['continuous+']),
    ("discrete", discrete_pipe, cols['discrete']),
    # ("categorical", category_pipe, cols['category']),
]

preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

In [16]:
# Full estimator: column preprocessing followed by the regression model.
steps = [('preprocessor', preprocessor), ('model', LinearRegression())]
pipe = Pipeline(steps=steps)

# Training

In [17]:
# Fit the full pipeline (preprocessing + LinearRegression) on the cleaned training split.
pipe.fit(X_train, y_train)

# Finding Best Model

In [18]:
# Candidate regressors that the grid search substitutes into the pipeline's
# 'model' step.
param_grid = {
    'model': [
        SVR(),
        LinearRegression(),
        # Seeded so the forest's random sampling, and therefore its
        # cross-validation scores, are reproducible from run to run.
        RandomForestRegressor(random_state=0),
    ],
}

In [19]:
# 5-fold grid search over the candidate models listed in `param_grid`.
cross_validator = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [20]:
# Run the grid search on the cleaned training split.
cross_validator.fit(X_train, y_train)

In [21]:
# Show the pipeline configuration that achieved the best mean CV score.
cross_validator.best_estimator_

In [22]:
# Score of the selected model on the same data the search was fitted on
# (default scorer, i.e. R^2 for these regressors). This is a training-set
# figure, not a held-out estimate; the test split is not evaluated here.
cross_validator.score(X_train, y_train)

0.9999999816721253

In [23]:
# Raw grid-search results: fit/score timings and per-fold test scores per candidate.
cross_validator.cv_results_

{'mean_fit_time': array([0.02119718, 0.01040125, 0.64959717]),
 'std_fit_time': array([0.00312391, 0.00049057, 0.04969913]),
 'mean_score_time': array([0.01100311, 0.0053987 , 0.01360602]),
 'std_score_time': array([0.00126373, 0.00049012, 0.00049482]),
 'param_model': masked_array(data=[SVR(), LinearRegression(), RandomForestRegressor()],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model': SVR()},
  {'model': LinearRegression()},
  {'model': RandomForestRegressor()}],
 'split0_test_score': array([-0.01138927,  0.99999998,  0.90754864]),
 'split1_test_score': array([-0.0394773 ,  0.99999998,  0.94380044]),
 'split2_test_score': array([-0.1102103 ,  0.99999998,  0.89898501]),
 'split3_test_score': array([-0.09417005,  0.99999998,  0.9413271 ]),
 'split4_test_score': array([-0.00199328,  0.99999998,  0.92671713]),
 'mean_test_score': array([-0.05144804,  0.99999998,  0.92367566]),
 'std_test_score': array([4.35245906e-02, 3.

In [24]:
# feature_names = []
# for i in preprocessor.named_transformers_:
#     if i == 'remainder':
#         continue
#     features = preprocessor.named_transformers_[i].get_feature_names_out().tolist()
#     for feature in features:
#         feature_names.append(feature)
# pd.DataFrame(preprocessor.fit_transform(X_train), columns=feature_names)

# Saving Model

In [25]:
# path = os.path.join('..', '..', 'models', 'model_task4.pkl')
# with open(path, 'wb') as file:
#     pickle.dump(cross_validator.best_estimator_, file)