In [1]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import QuantileTransformer,RobustScaler
from sklearn.preprocessing import PowerTransformer,MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [2]:
def split_train_test_data(dataset,predicting_label,test_size=0.33,random_state=42):
    """Separate the target column from the features and make a train/test split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding both the feature columns and the target column.
    predicting_label : str
        Name of the target column. It is removed from the feature matrix.
    test_size : float or int, default 0.33
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    y = dataset[predicting_label]
    X = dataset.drop(predicting_label,axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [3]:
def cardinality_cols(X_train, threshold=10):
    """Group the columns of ``X_train`` by dtype and number of distinct values.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table to inspect.
    threshold : int, default 10
        Object columns with fewer than ``threshold`` distinct values are
        "low cardinality"; all other object columns are "high cardinality".

    Returns
    -------
    tuple of list
        ``(low_cardinality_cols, high_cardinality_cols, numerical_cols)``,
        each in the column order of ``X_train``. ``numerical_cols`` holds the
        columns whose dtype is ``int64`` or ``float64``.
    """
    low_cardinality_cols = []
    high_cardinality_cols = []
    numerical_cols = []

    for cname in X_train.columns:
        column = X_train[cname]
        if column.dtype == "object":
            # The high bucket is the exact complement of the low bucket, so a
            # column with exactly `threshold` distinct values is not lost.
            if column.nunique() < threshold:
                low_cardinality_cols.append(cname)
            else:
                high_cardinality_cols.append(cname)
        elif column.dtype in ['int64', 'float64']:
            numerical_cols.append(cname)

    return low_cardinality_cols,high_cardinality_cols,numerical_cols
    
    

In [4]:
def fillvaluesbybfill(dataset):
    """Return a copy of ``dataset`` with missing values filled in.

    Numeric columns are back-filled (each gap takes the next valid value
    below it, so gaps at the very end of a column stay missing). Object
    columns are filled with that column's most frequent value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the fills applied.
    """
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')

    if len(a.columns):
        # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
        dataset1[a.columns] = a.bfill()

    for col in b.columns:
        modes = b[col].mode()
        # An all-missing column has no mode; leave it untouched instead of
        # failing on an empty result.
        if not modes.empty:
            # Write into the copy, never into the caller's frame.
            dataset1[col] = b[col].fillna(modes.iloc[0])

    return dataset1

In [5]:
def spliteddata_shape(X_train,X_test,y_train,y_test):
    """Print the ``.shape`` of each split, one per line.

    The output order is X_train, y_train, X_test, y_test.
    """
    for part in (X_train, y_train, X_test, y_test):
        print(part.shape)

In [6]:
def score_dataset(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    """Fit a 100-tree random forest on the training split and print test metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to the model's ``fit`` and ``predict``.
    y_train, y_test : array-like
        Targets for fitting and for scoring.
    predicted_label_type : str, default "object"
        ``"object"`` fits a ``RandomForestClassifier`` and prints the confusion
        matrix, a blank line and the accuracy. Any other value fits a
        ``RandomForestRegressor`` and prints the mean absolute error followed
        by the R-squared score.

    Returns
    -------
    None
        Results are only printed.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.metrics import mean_absolute_error, r2_score

    is_classification = predicted_label_type == "object"
    if is_classification:
        from sklearn.ensemble import RandomForestClassifier as forest_cls
    else:
        from sklearn.ensemble import RandomForestRegressor as forest_cls

    # Both estimators are configured identically; only the class differs.
    forest = forest_cls(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    if is_classification:
        print(confusion_matrix(y_test, predictions))
        print()
        print(accuracy_score(y_test, predictions))
        return

    print(mean_absolute_error(y_test, predictions))
    print(f'R square: {r2_score(y_test,predictions)}')

In [7]:
def drop_categorical(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    """Score a model trained on the non-object columns only (baseline approach).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; every ``object`` dtype column is excluded before scoring.
    y_train, y_test : array-like
        Targets forwarded to ``score_dataset``.
    predicted_label_type : str, default "object"
        Forwarded to ``score_dataset``. The default keeps the previous
        behavior (classification metrics); pass any other value, e.g.
        ``"numeric"``, to get the regression metrics (MAE and R square).

    Returns
    -------
    None
        The header and the metrics are printed.
    """
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_test.select_dtypes(exclude=['object'])

    # Only announce "MAE" when the regression branch will actually print one.
    if predicted_label_type == "object":
        print("Classification metrics from Approach 1 (Drop categorical variables):")
    else:
        print("MAE from Approach 1 (Drop categorical variables):")
    score_dataset(drop_X_train, drop_X_valid, y_train, y_test, predicted_label_type)

In [8]:
def good_and_bad_labels(X_train,X_test):
    """Split the object columns into those safe to encode and those to drop.

    A column is "good" when every value it takes in ``X_test`` also occurs in
    ``X_train``; every other object column is "bad".

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test feature tables. Object columns are taken from
        ``X_train`` and must also exist in ``X_test``.

    Returns
    -------
    tuple of list
        ``(good_label_cols, bad_label_cols)``, both in the column order of
        ``X_train``. Both lists are also printed.
    """
    object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

    # Columns that can be safely ordinal encoded
    good_label_cols = [col for col in object_cols if
                       set(X_test[col]).issubset(set(X_train[col]))]

    # Problematic columns that will be dropped from the dataset.
    # Built with a comprehension rather than a set difference so the order is
    # deterministic from run to run.
    good_lookup = set(good_label_cols)
    bad_label_cols = [col for col in object_cols if col not in good_lookup]

    print('Categorical columns that will be ordinal encoded:', good_label_cols)
    print('Categorical columns that will be dropped from the dataset:', bad_label_cols)

    return good_label_cols,bad_label_cols

In [9]:

def ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols):
    """Ordinal-encode ``good_label_cols`` and drop ``bad_label_cols``.

    The encoder is fitted on the training columns and then applied to the
    test columns. New frames are returned; the inputs are only read.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test)``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    # Categorical columns that will not be encoded are removed first.
    train_out = X_train.drop(columns=bad_label_cols)
    test_out = X_test.drop(columns=bad_label_cols)

    encoder = OrdinalEncoder()
    train_codes = encoder.fit_transform(X_train[good_label_cols])
    train_out[good_label_cols] = train_codes
    test_codes = encoder.transform(X_test[good_label_cols])
    test_out[good_label_cols] = test_codes

    return train_out, test_out

In [10]:
def onehot_encoding(X_train,X_test,y_train,y_test,good_label_cols=None,bad_label_cols=None):
    """One-hot encode the usable categorical columns and drop the others.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test feature tables.
    y_train, y_test : array-like
        Unused; kept so existing call sites keep working.
    good_label_cols : list of str, optional
        Columns to one-hot encode. When omitted, the object columns of
        ``X_train`` whose test values all occur in the training data are used.
    bad_label_cols : list of str, optional
        Columns to drop without encoding. When omitted, every other object
        column of ``X_train`` is dropped.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_train, encoded_test)``: the remaining original columns
        followed by one indicator column per category, sharing the same
        column layout and keeping the original row indexes.
    """
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    # Derive the column lists locally instead of relying on notebook globals.
    if good_label_cols is None or bad_label_cols is None:
        object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
        if good_label_cols is None:
            good_label_cols = [col for col in object_cols
                               if set(X_test[col]).issubset(set(X_train[col]))]
        if bad_label_cols is None:
            bad_label_cols = [col for col in object_cols if col not in good_label_cols]
    bad_label_cols = list(bad_label_cols)
    # A column scheduled for dropping must not also be encoded.
    good_label_cols = [col for col in good_label_cols if col not in bad_label_cols]

    # Drop categorical columns that will not be encoded
    label_X_train = X_train.drop(bad_label_cols, axis=1)
    label_X_valid = X_test.drop(bad_label_cols, axis=1)
    if not good_label_cols:
        return label_X_train,label_X_valid

    # Categories unseen during fit become all-zero rows rather than an error.
    onehot_encoder = OneHotEncoder(handle_unknown="ignore")
    encoded_train = onehot_encoder.fit_transform(X_train[good_label_cols])
    encoded_valid = onehot_encoder.transform(X_test[good_label_cols])

    # get_feature_names_out replaced get_feature_names in newer scikit-learn.
    if hasattr(onehot_encoder, "get_feature_names_out"):
        feature_names = onehot_encoder.get_feature_names_out(good_label_cols)
    else:
        feature_names = onehot_encoder.get_feature_names(good_label_cols)

    def _as_frame(encoded, index):
        # The encoder may return a sparse matrix; densify it so it can be
        # joined column-wise with the remaining features.
        dense = encoded.toarray() if hasattr(encoded, "toarray") else encoded
        return pd.DataFrame(dense, columns=feature_names, index=index)

    # One category expands to several columns, so the indicators are appended
    # rather than written back into the original column slots.
    label_X_train = pd.concat(
        [label_X_train.drop(good_label_cols, axis=1), _as_frame(encoded_train, X_train.index)],
        axis=1,
    )
    label_X_valid = pd.concat(
        [label_X_valid.drop(good_label_cols, axis=1), _as_frame(encoded_valid, X_test.index)],
        axis=1,
    )
    return label_X_train,label_X_valid



In [11]:
def standard_scaling(X_train,X_test):
    """Fit a ``StandardScaler`` on the training data and apply it to both splits.

    Copies of the inputs are handed to the scaler. Returns
    ``(scaled_train, scaled_test)`` as produced by ``fit_transform`` and
    ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_copy)
    scaled_test = scaler.transform(test_copy)
    return scaled_train, scaled_test

In [12]:
def min_max_scaler(X_train,X_test):
    """Fit a ``MinMaxScaler`` on the training data and apply it to both splits.

    Copies of the inputs are handed to the scaler. Returns
    ``(scaled_train, scaled_test)`` as produced by ``fit_transform`` and
    ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()

    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train_copy)
    scaled_test = scaler.transform(test_copy)
    return scaled_train, scaled_test

In [13]:
def robust_scaler(X_train,X_test):
    """Fit a ``RobustScaler`` on the training data and apply it to both splits.

    Copies of the inputs are handed to the scaler. Returns
    ``(scaled_train, scaled_test)`` as produced by ``fit_transform`` and
    ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()

    scaler = RobustScaler()
    scaled_train = scaler.fit_transform(train_copy)
    scaled_test = scaler.transform(test_copy)
    return scaled_train, scaled_test

In [14]:
def maxabs_scaler(X_train,X_test):
    """Fit a ``MaxAbsScaler`` on the training data and apply it to both splits.

    Copies of the inputs are handed to the scaler. Returns
    ``(scaled_train, scaled_test)`` as produced by ``fit_transform`` and
    ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()

    scaler = MaxAbsScaler()
    scaled_train = scaler.fit_transform(train_copy)
    scaled_test = scaler.transform(test_copy)
    return scaled_train, scaled_test

In [15]:
def quantile_transforemer_scaler(X_train,X_test):
    """Fit a ``QuantileTransformer`` on the training data and apply it to both splits.

    The number of quantiles is capped at the number of training rows, so
    small training sets no longer trigger scikit-learn's "n_quantiles is
    greater than the total number of samples" warning. A fixed
    ``random_state`` makes any internal subsampling repeatable.

    Parameters
    ----------
    X_train, X_test : array-like with ``.copy()`` and ``.shape``
        Train and test feature matrices. Copies are handed to the transformer.

    Returns
    -------
    tuple
        ``(transformed_train, transformed_test)`` as produced by
        ``fit_transform`` and ``transform``.
    """
    X_train5 = X_train.copy()
    X_test5  = X_test.copy()

    # 1000 is the library default for n_quantiles; never request more
    # quantiles than there are training rows.
    n_quantiles = max(1, min(1000, X_train5.shape[0]))
    qts = QuantileTransformer(n_quantiles=n_quantiles, random_state=0)

    X_train5 = qts.fit_transform(X_train5)
    X_test5 = qts.transform(X_test5)

    return X_train5,X_test5

In [16]:
def power_transforemer_scaler(X_train,X_test):
    """Fit a ``PowerTransformer`` on the training data and apply it to both splits.

    Copies of the inputs are handed to the transformer. Returns
    ``(transformed_train, transformed_test)`` as produced by
    ``fit_transform`` and ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()

    transformer = PowerTransformer()
    transformed_train = transformer.fit_transform(train_copy)
    transformed_test = transformer.transform(test_copy)
    return transformed_train, transformed_test

In [17]:
# Experiment: ordinal-encode the categorical columns, apply standard_scaling,
# then score a random-forest regressor on the held-out rows.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the scaler's output.
X_train_encoded,X_test_encoded = standard_scaling(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics
2.6659787007454736
R square: 0.9773587238892609


In [18]:
# Experiment: same pipeline as the other experiment cells, but scaled with
# min_max_scaler before scoring the random-forest regressor.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the scaler's output.
X_train_encoded,X_test_encoded = min_max_scaler(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics
2.6401368477103286
R square: 0.9774114939310131


In [19]:
# Experiment: same pipeline as the other experiment cells, but transformed with
# quantile_transforemer_scaler before scoring the random-forest regressor.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the transformer's output.
X_train_encoded,X_test_encoded = quantile_transforemer_scaler(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics




2.6216863684771017
R square: 0.9760231517191219


In [20]:
# Experiment: same pipeline as the other experiment cells, but scaled with
# robust_scaler before scoring the random-forest regressor.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the scaler's output.
X_train_encoded,X_test_encoded = robust_scaler(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics
2.5949909478168256
R square: 0.9783680749199166


In [21]:
# Experiment: same pipeline as the other experiment cells, but transformed with
# power_transforemer_scaler before scoring the random-forest regressor.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the transformer's output.
X_train_encoded,X_test_encoded = power_transforemer_scaler(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics
2.626476038338657
R square: 0.9782734199479659


In [22]:
# Experiment: same pipeline as the other experiment cells, but scaled with
# maxabs_scaler before scoring the random-forest regressor.
import pandas as pd 
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx") 
# Fill missing values before splitting (see fillvaluesbybfill).
dataset1 = fillvaluesbybfill(dataset)

# "CO2 Emissions(g/km)" is the target column; all other columns are features.
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three cardinality lists are computed but not used later in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
# Object columns whose test values all occur in train get encoded; the rest are
# dropped inside ordinal_encoding.
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
# The encoded frames are replaced by the scaler's output.
X_train_encoded,X_test_encoded = maxabs_scaler(X_train_encoded,X_test_encoded) 
print("error and metrics")
# Any label type other than "object" selects score_dataset's regression branch,
# which prints the mean absolute error and the R square value.
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']
error and metrics
2.65039190628328
R square: 0.9773194288447995
