# MODEL TRAINING

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('censusdata/adult.csv',names=['age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','Target'])
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [20]:
X = df.drop(labels=['Target','education_num'],axis=1)
Y = df[['Target']]

In [21]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [22]:
X['native_country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [23]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [24]:
numerical_cols

Index(['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week'], dtype='object')

In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [26]:
# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='if_binary'))
    ]
)

# Combine the numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ],
    remainder='passthrough'
)

# Create the final pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor)
        # Add additional steps as needed (e.g., model)
    ]
)

In [27]:
## Train Test Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

In [28]:
'''
x=pd.DataFrame(X_train_encoded)
x
X_train_dense = X_train_encoded.toarray()
len(X_train_dense[0])
'''

'\nx=pd.DataFrame(X_train_encoded)\nx\nX_train_dense = X_train_encoded.toarray()\nlen(X_train_dense[0])\n'

In [29]:
# Fit and transform the training data
X_train_encoded = pipeline.fit_transform(X_train)
X_train_columns = pipeline['preprocessor'].get_feature_names_out()
X_train_dense = X_train_encoded.toarray()
X_train = pd.DataFrame(X_train_dense, columns=X_train_columns)

# Transform the test data
X_test_encoded = pipeline.transform(X_test)
X_test_columns = pipeline['preprocessor'].get_feature_names_out()
X_test_dense = X_test_encoded.toarray()
X_test = pd.DataFrame(X_test_dense, columns=X_test_columns)

In [30]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__fnlwgt,num_pipeline__capital_gain,num_pipeline__capital_loss,num_pipeline__hours_per_week,cat_pipeline__workclass_ ?,cat_pipeline__workclass_ Federal-gov,cat_pipeline__workclass_ Local-gov,cat_pipeline__workclass_ Never-worked,cat_pipeline__workclass_ Private,...,cat_pipeline__native_country_ Portugal,cat_pipeline__native_country_ Puerto-Rico,cat_pipeline__native_country_ Scotland,cat_pipeline__native_country_ South,cat_pipeline__native_country_ Taiwan,cat_pipeline__native_country_ Thailand,cat_pipeline__native_country_ Trinadad&Tobago,cat_pipeline__native_country_ United-States,cat_pipeline__native_country_ Vietnam,cat_pipeline__native_country_ Yugoslavia
0,0.762703,0.03505,-0.145994,-0.219309,-0.03143,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.850415,-0.527734,-0.145994,-0.219309,0.375095,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.396085,-0.187216,-0.145994,-0.219309,1.59467,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.495939,4.432379,-0.145994,-0.219309,1.59467,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.129321,-0.014657,-0.145994,3.251875,-0.19404,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
## Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
log = LogisticRegression()
model=log.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
y_pred = model.predict(X_test)

In [34]:
y_pred

array([' <=50K', ' <=50K', ' >50K', ..., ' <=50K', ' >50K', ' <=50K'],
      dtype=object)

In [35]:
accuracy_score(y_test,y_pred)

0.8582483724358186

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[5814,  400],
       [ 754, 1173]], dtype=int64)

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
dtc = DecisionTreeClassifier()
model = dtc.fit(X_train,y_train)

In [40]:
y_pred = model.predict(X_test)

In [41]:
y_pred

array([' <=50K', ' <=50K', ' >50K', ..., ' <=50K', ' >50K', ' <=50K'],
      dtype=object)

In [42]:
accuracy_score(y_test,y_pred)

0.8206608524751259

In [43]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[5426,  788],
       [ 672, 1255]], dtype=int64)

In [44]:
## model training
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [45]:
import numpy as np
def evaluate_model(true,predicted):
    accuracy = accuracy_score(true,predicted)
    cm = confusion_matrix(true,predicted)
    return accuracy, cm

In [46]:
## Train mutiple models

models = {
    'LogisticRegression' : LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier' : RandomForestClassifier(),
}

model_list=[]
accuracy_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # make prediction
    y_pred = model.predict(X_test)

    accuracy,cm=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('Accuracy score: ', accuracy)
    print('Confusion matrix: ')
    print(cm)

    accuracy_list.append(accuracy)

    print('='*40)
    print('\n')

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Model Training Performance
Accuracy score:  0.8582483724358186
Confusion matrix: 
[[5814  400]
 [ 754 1173]]


DecisionTreeClassifier
Model Training Performance
Accuracy score:  0.8175899766613438
Confusion matrix: 
[[5398  816]
 [ 669 1258]]




  model.fit(X_train,y_train)


RandomForestClassifier
Model Training Performance
Accuracy score:  0.8589853826311264
Confusion matrix: 
[[5794  420]
 [ 728 1199]]




In [79]:
model_list

['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier']