In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import QuantileTransformer,RobustScaler
from sklearn.preprocessing import PowerTransformer,MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [23]:
def maxabs_scaler(X_train,X_test):
    rs = MaxAbsScaler()
    X_train4 = X_train.copy()
    X_test4  = X_test.copy()
    
    X_train4 = rs.fit_transform(X_train4)
    X_test4 = rs.transform(X_test4)
    
    return X_train4,X_test4

In [24]:
def robust_scaler(X_train,X_test):
    rs = RobustScaler()
    X_train3 = X_train.copy()
    X_test3  = X_test.copy()
    
    X_train3 = rs.fit_transform(X_train3)
    X_test3 = rs.transform(X_test3)
    
    return X_train3,X_test3

In [25]:
def quantile_transforemer_scaler(X_train,X_test):
    qts = QuantileTransformer()
    X_train5 = X_train.copy()
    X_test5  = X_test.copy()
    
    X_train5 = qts.fit_transform(X_train5)
    X_test5 = qts.transform(X_test5)
    
    return X_train5,X_test5

In [26]:
def min_max_scaler(X_train,X_test):
    mn = MinMaxScaler()
    X_train2 = X_train.copy()
    X_test2  = X_test.copy()
    
    X_train2 = mn.fit_transform(X_train2)
    X_test2 = mn.transform(X_test2)
    
    return X_train2,X_test2

In [27]:
def power_transforemer_scaler(X_train,X_test):
    pts = PowerTransformer()
    X_train6 = X_train.copy()
    X_test6  = X_test.copy()
    
    X_train6 = pts.fit_transform(X_train6)
    X_test6 = pts.transform(X_test6)
    
    return X_train6,X_test6

In [28]:
def standard_scaling(X_train,X_test):
    sc = StandardScaler()
    X_train1 = X_train.copy()
    X_test1  = X_test.copy()
    
    X_train1 = sc.fit_transform(X_train1)
    X_test1 = sc.transform(X_test1)
    
    return X_train1,X_test1

In [29]:
def split_train_test_data(dataset,predicting_label):
    
    from sklearn.model_selection import train_test_split
    y = dataset[predicting_label]
    X = dataset.drop(predicting_label,axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [30]:
def cardinality_cols(X_train):
    
    low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype == "object"] 
    high_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() > 10 and X_train[cname].dtype == "object"] 
    numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']] 
    
    return low_cardinality_cols,high_cardinality_cols,numerical_cols
    
    

In [31]:
def spliteddata_shape(X_train,X_test,y_train,y_test):
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

In [32]:
def score_dataset(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    from sklearn.metrics import mean_absolute_error,confusion_matrix
    from sklearn.metrics import r2_score,accuracy_score
    
    if predicted_label_type=="object":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(confusion_matrix(y_test,preds))
        print()
        print(accuracy_score(y_test,preds))
    
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(mean_absolute_error(y_test, preds)) 
        print(f'R square: {r2_score(y_test,preds)}')

In [33]:
def drop_categorical(X_train,X_test,y_train,y_test):
    
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_test.select_dtypes(exclude=['object'])

    print("MAE from Approach 1 (Drop categorical variables):")
    score_dataset(drop_X_train, drop_X_valid, y_train, y_test)

In [34]:
def good_and_bad_labels(X_train,X_test):
    object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

    # Columns that can be safely ordinal encoded
    good_label_cols = [col for col in object_cols if 
                       set(X_test[col]).issubset(set(X_train[col]))]
        
    # Problematic columns that will be dropped from the dataset
    bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
    print('Categorical columns that will be ordinal encoded:', good_label_cols)
    print('Categorical columns that will be dropped from the dataset:', bad_label_cols)
    
    return good_label_cols,bad_label_cols

In [35]:

def ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols):
    from sklearn.preprocessing import OrdinalEncoder
    # Drop categorical columns that will not be encoded
    label_X_train = X_train.drop(bad_label_cols, axis=1)
    label_X_valid = X_test.drop(bad_label_cols, axis=1)

    ordinal_encoder = OrdinalEncoder()
    label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
    label_X_valid[good_label_cols] = ordinal_encoder.transform(X_test[good_label_cols])
    
    return label_X_train,label_X_valid

In [36]:
def remove_nullvalues(dataset):
    dataset1 = dataset.copy()
    dataset1 = dataset.dropna(axis=0)
    return dataset1

In [37]:
def fillvaluesbymean(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.mean())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [38]:
def fillvaluesbymedian(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.median())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [39]:
def fillvaluesbymode(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    for i in a.columns:
        dataset1[i] = dataset1[i].fillna(dataset1[i].mode()[0])
    dataset1[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [40]:
def fillvaluesbybfill(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(method='bfill')
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [41]:
#dataset1

In [42]:
import pandas as pd 
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx")

In [43]:
#remove null values
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(615, 14)
(615,)
(303, 14)
(303,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
1.6864864057834352
R square: 0.9955956530454186


In [44]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
3.1792161874334406
R square: 0.9648285666699064


In [45]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
3.065188041989958
R square: 0.968481713122613


In [46]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
3.020348052639586
R square: 0.9693834402502652


In [47]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
2.6486208732694365
R square: 0.9780981181128975


In [48]:
#dataset2

In [49]:
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 2\MISSING VALUES\data.csv")

In [50]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(349, 31)
(349,)
(172, 31)
(172,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[102   1]
 [  9  60]]

0.9418604651162791


In [51]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[119   2]
 [  4  63]]

0.9680851063829787


In [52]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  6  61]]

0.9521276595744681


In [53]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  4  63]]

0.9627659574468085


In [54]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[119   2]
 [  5  62]]

0.9627659574468085


In [55]:
#dataset 3
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 3\MISSING VALUES\winequality-red.csv")

In [56]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1046, 11)
(1046,)
(516, 11)
(516,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   1   0   0   0]
 [  1   0   8   5   1   0]
 [  0   0 173  44   1   0]
 [  0   0  61 133  13   1]
 [  0   0   4  27  35   0]
 [  0   0   0   5   3   0]]

0.6608527131782945


In [57]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   1   1   0   0]
 [  0   0   9  10   0   0]
 [  0   0 167  48   2   0]
 [  0   0  48 148  17   0]
 [  0   0   2  38  29   1]
 [  0   0   0   2   4   1]]

0.6534090909090909


In [58]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 171  42   4   0]
 [  0   0  50 146  17   0]
 [  0   0   0  42  27   1]
 [  0   0   0   2   4   1]]

0.6534090909090909


In [59]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 163  50   4   0]
 [  0   0  47 149  17   0]
 [  0   0   2  39  28   1]
 [  0   0   0   0   6   1]]

0.6458333333333334


In [60]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  10   9   0   0]
 [  0   0 166  50   1   0]
 [  0   0  48 148  17   0]
 [  0   0   2  37  30   1]
 [  0   0   0   0   6   1]]

0.6534090909090909


In [62]:
#datset 4
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 4\MISSING VALUES\diabetes.csv")

In [63]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(489, 8)
(489,)
(242, 8)
(242,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[131  24]
 [ 33  54]]

0.7644628099173554


In [64]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[139  29]
 [ 33  53]]

0.7559055118110236


In [65]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[141  27]
 [ 35  51]]

0.7559055118110236


In [66]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[137  31]
 [ 32  54]]

0.7519685039370079


In [67]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[137  31]
 [ 33  53]]

0.7480314960629921


In [68]:
#dataset 5
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 5\MISSING VALUES\Child Immunization Dataset.xls")

In [70]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(434, 39)
(434,)
(214, 39)
(214,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'IS_P']

error and metrics
[[112   3]
 [  6  93]]

0.9579439252336449


In [71]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'IS_P']

error and metrics
[[106  11]
 [  5 101]]

0.9282511210762332


In [72]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'IS_P']

error and metrics
[[107  10]
 [  5 101]]

0.9327354260089686


In [73]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'IS_P']

error and metrics
[[108   9]
 [  5 101]]

0.9372197309417041


In [74]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'IS_P']

error and metrics
[[110   7]
 [  5 101]]

0.9461883408071748


In [75]:
#dataset6
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 6\MISSING VALUES\hmeq.csv")

In [76]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2253, 12)
(2253,)
(1111, 12)
(1111,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1009    2]
 [  53   47]]

0.9504950495049505


In [81]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1489   46]
 [ 156  276]]

0.8973055414336553


In [80]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1487   48]
 [ 149  283]]

0.8998474834773768


In [82]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1483   52]
 [ 156  276]]

0.8942552109811897


In [83]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1507   28]
 [ 193  239]]

0.887646161667514


In [84]:
#dataset7
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 7\MISSING VALUES\NFL.csv")

In [85]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(789, 17)
(789,)
(390, 17)
(390,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type']
Categorical columns that will be dropped from the dataset: ['Drafted..tm.rnd.yr.', 'School', 'Position', 'Player']

error and metrics
[[390]]

1.0


In [86]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['School', 'Drafted..tm.rnd.yr.', 'Player']

error and metrics
[[171 220]
 [105 652]]

0.7168989547038328


In [87]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['School', 'Drafted..tm.rnd.yr.', 'Player']

error and metrics
[[222 169]
 [ 56 701]]

0.804006968641115


In [88]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['School', 'Drafted..tm.rnd.yr.', 'Player']

error and metrics
[[163 228]
 [101 656]]

0.7134146341463414


dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

# this wont work because the missing values will be continous without breaks

In [89]:
#dataset 8
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\research paper\5 movie_metadata.csv")

In [90]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2516, 27)
(2516,)
(1240, 27)
(1240,)

Categorical columns that will be ordinal encoded: ['color', 'content_rating']
Categorical columns that will be dropped from the dataset: ['country', 'movie_title', 'plot_keywords', 'director_name', 'actor_3_name', 'actor_1_name', 'genres', 'language', 'movie_imdb_link', 'actor_2_name']

error and metrics
0.5275475806451612
R square: 0.5515772117878344


In [91]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['country', 'movie_title', 'plot_keywords', 'content_rating', 'director_name', 'actor_3_name', 'actor_1_name', 'genres', 'language', 'movie_imdb_link', 'actor_2_name']

error and metrics
0.5915339339339338
R square: 0.4702672427187383


In [94]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['country', 'movie_title', 'plot_keywords', 'content_rating', 'director_name', 'actor_3_name', 'actor_1_name', 'genres', 'language', 'movie_imdb_link', 'actor_2_name']

error and metrics
0.5943801801801801
R square: 0.47242372092063634


In [93]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['country', 'movie_title', 'plot_keywords', 'content_rating', 'director_name', 'actor_3_name', 'actor_1_name', 'genres', 'language', 'movie_imdb_link', 'actor_2_name']

error and metrics
0.597204804804805
R square: 0.47484437115980527


In [95]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['country', 'movie_title', 'plot_keywords', 'content_rating', 'director_name', 'actor_3_name', 'actor_1_name', 'genres', 'language', 'movie_imdb_link', 'actor_2_name']

error and metrics
0.596693093093093
R square: 0.47325685015344177


In [96]:
#dataset 9

dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 11\Adult ICU patients project.xlsx")

In [97]:
dataset1 = remove_nullvalues(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)


(791, 30)
(791,)
(390, 30)
(390,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['MRN', 'DATE OF BIRTH', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 59  35]
 [ 20 276]]

0.8589743589743589


In [98]:
dataset1 = fillvaluesbymean(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['MRN', 'DATE OF BIRTH', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[100  27]
 [ 22 281]]

0.8860465116279069


In [99]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['MRN', 'DATE OF BIRTH', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 55  30]
 [ 21 324]]

0.8813953488372093


In [100]:
dataset1 = fillvaluesbymode(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['MRN', 'DATE OF BIRTH', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 59  26]
 [ 22 323]]

0.8883720930232558


dataset1 = fillvaluesbybfill(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

#this wont work because the missing values will be continous without breaks

In [101]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 12\MISSING VALUES\2016 County Election Data.csv")

In [102]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(1993, 8)
(1993,)
(982, 8)
(982,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.794525254582485
R square: 0.708972310672983


In [103]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.706185957873618
R square: 0.7290412501567933


In [104]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.556314656914651
R square: 0.7357114843099433


In [105]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.590108324974924
R square: 0.7322709095698114


In [106]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.578029889669008
R square: 0.7372475910399404


In [107]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 13\MISSING VALUES\churn.csv")

In [108]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3308, 17)
(3308,)
(1630, 17)
(1630,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 41 239  59   6   4   0   0   0   0   0]
 [ 56 427 103  15   5   1   0   0   0   0]
 [ 25 241  63   2   4   1   0   0   0   0]
 [ 21 157  26   6   1   0   0   0   0   0]
 [  4  47  11   2  18   1   0   0   0   0]
 [  3  11   0   0   9   1   0   0   0   0]
 [  1   8   0   0   3   1   0   0   0   0]
 [  1   2   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.3411042944785276


In [109]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 36 276  25   3   5   0   0   0   0   0]
 [ 62 444  50   9   7   1   0   0   0   0]
 [ 37 298  45   3   2   0   0   0   0   0]
 [ 23 161  22   6   2   0   0   0   0   0]
 [  8  51  10   0  16   2   0   0   0   0]
 [  2  13   2   1  12   0   0   0   0   0]
 [  0   5   0   0   5   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.33151515151515154


In [110]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 31 275  32   3   4   0   0   0   0   0]
 [ 52 436  69   9   5   2   0   0   0   0]
 [ 36 280  63   2   3   1   0   0   0   0]
 [ 22 160  26   5   0   1   0   0   0   0]
 [  6  52  11   0  13   5   0   0   0   0]
 [  1  15   1   0  12   1   0   0   0   0]
 [  0   5   1   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.3327272727272727


In [111]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 30 279  27   5   4   0   0   0   0   0]
 [ 57 437  60   9   8   2   0   0   0   0]
 [ 32 286  58   4   4   1   0   0   0   0]
 [ 20 168  21   2   2   1   0   0   0   0]
 [  8  52   9   1  16   1   0   0   0   0]
 [  3  11   2   2  12   0   0   0   0   0]
 [  0   4   1   0   5   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.3290909090909091


In [112]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 37 269  31   4   4   0   0   0   0   0]
 [ 58 435  61   7  12   0   0   0   0   0]
 [ 30 302  46   4   3   0   0   0   0   0]
 [ 19 167  23   3   1   1   0   0   0   0]
 [  8  50  12   0  14   3   0   0   0   0]
 [  2  14   1   1  12   0   0   0   0   0]
 [  0   4   2   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.3242424242424242


In [113]:
#dataset11

dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 14\MISSING VALUES\healthcare-dataset-stroke-data.csv",sep=",")

In [114]:
dataset1 = remove_nullvalues(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['stroke'] = dataset1['stroke'].astype('int')


(3241, 11)
(3241,)
(1597, 11)
(1597,)

Categorical columns that will be ordinal encoded: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1518    1]
 [  77    1]]

0.9511584220413275


In [115]:
dataset1 = fillvaluesbymean(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [116]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1590    1]
 [  96    0]]

0.942501481920569


In [117]:
dataset1 = fillvaluesbymode(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1590    1]
 [  96    0]]

0.942501481920569


In [118]:
dataset1 = fillvaluesbybfill(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1587    4]
 [  96    0]]

0.9407231772377


In [119]:
#dataset 12
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 15\MISSING VALUES\abalone.csv",sep=",")

In [121]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2767, 8)
(2767,)
(1364, 8)
(1364,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.563966275659824
R square: 0.5436768520180655


In [122]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5732124728063814
R square: 0.5188815664528883


In [123]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5747715736040606
R square: 0.5141776514466729


In [124]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5831472081218274
R square: 0.512148679024566


In [125]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5739738941261785
R square: 0.5155911364859544


In [126]:
#dataset 13
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 16\MISSING VALUES\Pokemon.csv",sep=",")

In [127]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(369, 12)
(369,)
(183, 12)
(183,)

Categorical columns that will be ordinal encoded: ['type1', 'type2']
Categorical columns that will be dropped from the dataset: ['name']

error and metrics
[[151   8]
 [  1  23]]

0.9508196721311475


In [130]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[309   3]
 [  4  38]]

0.980225988700565


In [131]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[308   4]
 [  4  38]]

0.9774011299435028


In [132]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[309   3]
 [  5  37]]

0.9774011299435028


In [133]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[308   4]
 [  7  35]]

0.9689265536723164
