In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import QuantileTransformer,RobustScaler
from sklearn.preprocessing import PowerTransformer,MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [2]:
def maxabs_scaler(X_train,X_test):
    rs = MaxAbsScaler()
    X_train4 = X_train.copy()
    X_test4  = X_test.copy()
    
    X_train4 = rs.fit_transform(X_train4)
    X_test4 = rs.transform(X_test4)
    
    return X_train4,X_test4

In [3]:
def robust_scaler(X_train,X_test):
    rs = RobustScaler()
    X_train3 = X_train.copy()
    X_test3  = X_test.copy()
    
    X_train3 = rs.fit_transform(X_train3)
    X_test3 = rs.transform(X_test3)
    
    return X_train3,X_test3

In [4]:
def quantile_transforemer_scaler(X_train,X_test):
    qts = QuantileTransformer()
    X_train5 = X_train.copy()
    X_test5  = X_test.copy()
    
    X_train5 = qts.fit_transform(X_train5)
    X_test5 = qts.transform(X_test5)
    
    return X_train5,X_test5

In [5]:
def min_max_scaler(X_train,X_test):
    mn = MinMaxScaler()
    X_train2 = X_train.copy()
    X_test2  = X_test.copy()
    
    X_train2 = mn.fit_transform(X_train2)
    X_test2 = mn.transform(X_test2)
    
    return X_train2,X_test2

In [6]:
def power_transforemer_scaler(X_train,X_test):
    pts = PowerTransformer()
    X_train6 = X_train.copy()
    X_test6  = X_test.copy()
    
    X_train6 = pts.fit_transform(X_train6)
    X_test6 = pts.transform(X_test6)
    
    return X_train6,X_test6

In [7]:
def standard_scaling(X_train,X_test):
    sc = StandardScaler()
    X_train1 = X_train.copy()
    X_test1  = X_test.copy()
    
    X_train1 = sc.fit_transform(X_train1)
    X_test1 = sc.transform(X_test1)
    
    return X_train1,X_test1

In [8]:
def split_train_test_data(dataset,predicting_label):
    
    from sklearn.model_selection import train_test_split
    y = dataset[predicting_label]
    X = dataset.drop(predicting_label,axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [9]:
def cardinality_cols(X_train):
    
    low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype == "object"] 
    high_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() > 10 and X_train[cname].dtype == "object"] 
    numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']] 
    
    return low_cardinality_cols,high_cardinality_cols,numerical_cols
    
    

In [10]:
def spliteddata_shape(X_train,X_test,y_train,y_test):
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

In [11]:
def score_dataset(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    from sklearn.metrics import mean_absolute_error,confusion_matrix
    from sklearn.metrics import r2_score,accuracy_score
    
    if predicted_label_type=="object":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(confusion_matrix(y_test,preds))
        print()
        print(accuracy_score(y_test,preds))
    
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(mean_absolute_error(y_test, preds)) 
        print(f'R square: {r2_score(y_test,preds)}')

In [12]:
def drop_categorical(X_train,X_test,y_train,y_test):
    
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_test.select_dtypes(exclude=['object'])

    print("MAE from Approach 1 (Drop categorical variables):")
    score_dataset(drop_X_train, drop_X_valid, y_train, y_test)

In [13]:
def good_and_bad_labels(X_train,X_test):
    object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

    # Columns that can be safely ordinal encoded
    good_label_cols = [col for col in object_cols if 
                       set(X_test[col]).issubset(set(X_train[col]))]
        
    # Problematic columns that will be dropped from the dataset
    bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
    print('Categorical columns that will be ordinal encoded:', good_label_cols)
    print('Categorical columns that will be dropped from the dataset:', bad_label_cols)
    
    return good_label_cols,bad_label_cols

In [14]:

def ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols):
    from sklearn.preprocessing import OrdinalEncoder
    # Drop categorical columns that will not be encoded
    label_X_train = X_train.drop(bad_label_cols, axis=1)
    label_X_valid = X_test.drop(bad_label_cols, axis=1)

    ordinal_encoder = OrdinalEncoder()
    label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
    label_X_valid[good_label_cols] = ordinal_encoder.transform(X_test[good_label_cols])
    
    return label_X_train,label_X_valid

In [15]:
def remove_nullvalues(dataset):
    dataset1 = dataset.copy()
    dataset1 = dataset.dropna(axis=0)
    return dataset1

In [16]:
def fillvaluesbymean(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.mean())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [17]:
def fillvaluesbymedian(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.median())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [18]:
def fillvaluesbymode(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    for i in a.columns:
        dataset1[i] = dataset1[i].fillna(dataset1[i].mode()[0])
    dataset1[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [19]:
def fillvaluesbybfill(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(method='bfill')
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [20]:
#dataset1

In [21]:
import pandas as pd 
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx")

In [22]:
#remove null values
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(615, 14)
(615,)
(303, 14)
(303,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
1.6765756504221847
R square: 0.9957640424260459


In [23]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
3.096067244789289
R square: 0.9707456263113216


In [24]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
2.940763045793399
R square: 0.9720034340185706


In [25]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
3.06068903088392
R square: 0.969115605531393


In [26]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Transmission', 'Model']

error and metrics
2.591781758709874
R square: 0.9823242516640845


In [27]:
#dataset2

In [28]:
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 2\MISSING VALUES\data.csv")

In [29]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(349, 31)
(349,)
(172, 31)
(172,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[102   1]
 [  9  60]]

0.9418604651162791


In [30]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  6  61]]

0.9521276595744681


In [31]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[119   2]
 [  4  63]]

0.9680851063829787


In [32]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[119   2]
 [  5  62]]

0.9627659574468085


In [33]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  4  63]]

0.9627659574468085


In [34]:
#dataset 3
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 3\MISSING VALUES\winequality-red.csv")

In [35]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1046, 11)
(1046,)
(516, 11)
(516,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   1   0   0   0]
 [  1   0   8   5   1   0]
 [  0   0 173  44   1   0]
 [  0   0  65 130  13   0]
 [  0   0   4  28  34   0]
 [  0   0   0   6   2   0]]

0.6531007751937985


In [36]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 166  47   4   0]
 [  0   0  48 146  19   0]
 [  0   0   2  40  27   1]
 [  0   0   0   0   6   1]]

0.6439393939393939


In [37]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  10   9   0   0]
 [  0   0 167  47   3   0]
 [  0   0  53 143  17   0]
 [  0   0   0  42  27   1]
 [  0   0   0   2   4   1]]

0.6401515151515151


In [38]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  10   9   0   0]
 [  0   0 163  51   3   0]
 [  0   0  43 151  19   0]
 [  0   0   3  41  25   1]
 [  0   0   0   0   6   1]]

0.6439393939393939


In [39]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   1   1   0   0]
 [  0   0  10   9   0   0]
 [  0   0 165  48   4   0]
 [  0   0  51 148  14   0]
 [  0   0   2  39  28   1]
 [  0   0   0   3   3   1]]

0.6477272727272727


In [40]:
#datset 4
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 4\MISSING VALUES\diabetes.csv")

In [41]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(489, 8)
(489,)
(242, 8)
(242,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[131  24]
 [ 33  54]]

0.7644628099173554


In [42]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[136  32]
 [ 32  54]]

0.7480314960629921


In [43]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[137  31]
 [ 35  51]]

0.7401574803149606


In [44]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[139  29]
 [ 35  51]]

0.7480314960629921


In [45]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[139  29]
 [ 31  55]]

0.7637795275590551


In [46]:
#dataset 5
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 5\MISSING VALUES\Child Immunization Dataset.xls")

In [47]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(434, 39)
(434,)
(214, 39)
(214,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[112   3]
 [  6  93]]

0.9579439252336449


In [48]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[108   9]
 [  6 100]]

0.9327354260089686


In [49]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[107  10]
 [  5 101]]

0.9327354260089686


In [50]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[107  10]
 [  5 101]]

0.9327354260089686


In [51]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[108   9]
 [  5 101]]

0.9372197309417041


In [52]:
#dataset6
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 6\MISSING VALUES\hmeq.csv")

In [53]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2253, 12)
(2253,)
(1111, 12)
(1111,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1010    1]
 [  50   50]]

0.9540954095409541


In [54]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html

In [55]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1493   42]
 [ 147  285]]

0.9039145907473309


In [56]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1479   56]
 [ 148  284]]

0.8962887646161668


In [57]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3993, 12)
(3993,)
(1967, 12)
(1967,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1514   21]
 [ 187  245]]

0.8942552109811897


In [58]:
#dataset7
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 7\MISSING VALUES\NFL.csv")

In [59]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(789, 17)
(789,)
(390, 17)
(390,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type']
Categorical columns that will be dropped from the dataset: ['Position', 'Player', 'School', 'Drafted..tm.rnd.yr.']

error and metrics
[[390]]

1.0


In [60]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'School', 'Drafted..tm.rnd.yr.']

error and metrics
[[173 218]
 [109 648]]

0.7151567944250871


In [61]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'School', 'Drafted..tm.rnd.yr.']

error and metrics
[[222 169]
 [ 49 708]]

0.8101045296167247


In [62]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'School', 'Drafted..tm.rnd.yr.']

error and metrics
[[166 225]
 [ 96 661]]

0.7203832752613241


dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

# this wont work because the missing values will be continous without breaks

In [63]:
#dataset 8
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\research paper\5 movie_metadata.csv")

In [64]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2516, 27)
(2516,)
(1240, 27)
(1240,)

Categorical columns that will be ordinal encoded: ['color', 'content_rating']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'country', 'actor_2_name', 'language', 'movie_imdb_link', 'genres', 'movie_title', 'plot_keywords', 'actor_1_name', 'director_name']

error and metrics
0.5310314516129032
R square: 0.5508479427162967


In [65]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'country', 'actor_2_name', 'language', 'movie_imdb_link', 'genres', 'content_rating', 'movie_title', 'plot_keywords', 'actor_1_name', 'director_name']

error and metrics
0.5913795795795794
R square: 0.4727395734249411


In [72]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'country', 'actor_2_name', 'language', 'movie_imdb_link', 'genres', 'content_rating', 'movie_title', 'plot_keywords', 'actor_1_name', 'director_name']

error and metrics
0.5917405405405405
R square: 0.4833570323786107


In [73]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'country', 'actor_2_name', 'language', 'movie_imdb_link', 'genres', 'content_rating', 'movie_title', 'plot_keywords', 'actor_1_name', 'director_name']

error and metrics
0.5946522522522522
R square: 0.4716388622830292


In [74]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'country', 'actor_2_name', 'language', 'movie_imdb_link', 'genres', 'content_rating', 'movie_title', 'plot_keywords', 'actor_1_name', 'director_name']

error and metrics
0.600863063063063
R square: 0.4671140420312152


In [75]:
#dataset 9

dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 11\Adult ICU patients project.xlsx")

In [76]:
dataset1 = remove_nullvalues(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)


(791, 30)
(791,)
(390, 30)
(390,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['If previous question is Yes, Specify:', 'Nationality', 'DATE OF BIRTH', 'MRN']

error and metrics
[[ 61  33]
 [ 21 275]]

0.8615384615384616


In [77]:
dataset1 = fillvaluesbymean(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['If previous question is Yes, Specify:', 'Nationality', 'DATE OF BIRTH', 'MRN']

error and metrics
[[ 98  29]
 [ 24 279]]

0.8767441860465116


In [78]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['If previous question is Yes, Specify:', 'Nationality', 'DATE OF BIRTH', 'MRN']

error and metrics
[[ 59  26]
 [ 21 324]]

0.8906976744186047


In [79]:
dataset1 = fillvaluesbymode(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['If previous question is Yes, Specify:', 'Nationality', 'DATE OF BIRTH', 'MRN']

error and metrics
[[ 61  24]
 [ 24 321]]

0.8883720930232558


dataset1 = fillvaluesbybfill(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

#this wont work because the missing values will be continous without breaks

In [80]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 12\MISSING VALUES\2016 County Election Data.csv")

In [81]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(1993, 8)
(1993,)
(982, 8)
(982,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.813095926680244
R square: 0.7093988196510861


In [82]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.669159177532599
R square: 0.7288656462001042


In [83]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.594368015863054
R square: 0.7354732951165821


In [84]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.645967602808424
R square: 0.7299288328901252


In [85]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.583310030090269
R square: 0.7364833177021173


In [86]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 13\MISSING VALUES\churn.csv")

In [87]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3308, 17)
(3308,)
(1630, 17)
(1630,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 34 247  55   9   4   0   0   0   0   0]
 [ 52 434 100  16   5   0   0   0   0   0]
 [ 31 234  63   4   3   1   0   0   0   0]
 [ 19 145  36   9   2   0   0   0   0   0]
 [  8  43  12   5  11   4   0   0   0   0]
 [  3   5   1   1  13   1   0   0   0   0]
 [  1   7   1   0   3   1   0   0   0   0]
 [  2   2   0   0   1   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.33865030674846625


In [88]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 31 281  27   2   4   0   0   0   0   0]
 [ 47 448  62   6   9   1   0   0   0   0]
 [ 35 279  61   7   3   0   0   0   0   0]
 [ 20 162  25   5   2   0   0   0   0   0]
 [  5  53  10   0  16   3   0   0   0   0]
 [  2  13   2   1  12   0   0   0   0   0]
 [  0   4   1   0   4   1   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.34


In [89]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 37 269  32   2   5   0   0   0   0   0]
 [ 58 433  66   7   9   0   0   0   0   0]
 [ 26 295  58   2   4   0   0   0   0   0]
 [ 23 166  20   3   2   0   0   0   0   0]
 [  9  51   8   0  17   2   0   0   0   0]
 [  3  12   2   1  12   0   0   0   0   0]
 [  0   3   2   0   4   0   1   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.3327272727272727


In [90]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 31 279  30   2   3   0   0   0   0   0]
 [ 54 433  65   9  11   1   0   0   0   0]
 [ 35 285  54   6   5   0   0   0   0   0]
 [ 20 158  29   6   0   1   0   0   0   0]
 [  7  52  13   0  12   3   0   0   0   0]
 [  2  13   6   0   8   1   0   0   0   0]
 [  1   4   1   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.32545454545454544


In [91]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 22 281  33   4   5   0   0   0   0   0]
 [ 63 430  63   8   9   0   0   0   0   0]
 [ 45 277  51   9   3   0   0   0   0   0]
 [ 17 166  23   6   1   1   0   0   0   0]
 [  8  53   9   0  13   4   0   0   0   0]
 [  1  13   3   1  12   0   0   0   0   0]
 [  0   4   1   0   5   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.31636363636363635


In [92]:
#dataset11

dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 14\MISSING VALUES\healthcare-dataset-stroke-data.csv",sep=",")

In [93]:
dataset1 = remove_nullvalues(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['stroke'] = dataset1['stroke'].astype('int')


(3241, 11)
(3241,)
(1597, 11)
(1597,)

Categorical columns that will be ordinal encoded: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1518    1]
 [  78    0]]

0.9505322479649343


In [94]:
dataset1 = fillvaluesbymean(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1590    1]
 [  96    0]]

0.942501481920569


In [95]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1588    3]
 [  96    0]]

0.941315945465323


In [96]:
dataset1 = fillvaluesbymode(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [97]:
dataset1 = fillvaluesbybfill(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1590    1]
 [  96    0]]

0.942501481920569


In [98]:
#dataset 12
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 15\MISSING VALUES\abalone.csv",sep=",")

In [99]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2767, 8)
(2767,)
(1364, 8)
(1364,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.556884164222874
R square: 0.5484641852635683


In [100]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5736693255982597
R square: 0.5146481023386247


In [101]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5725235678027558
R square: 0.5167663120234325


In [102]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5799347353154463
R square: 0.514211110947947


In [103]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5733502538071067
R square: 0.516507909647468


In [104]:
#dataset 13
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 16\MISSING VALUES\Pokemon.csv",sep=",")

In [105]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(369, 12)
(369,)
(183, 12)
(183,)

Categorical columns that will be ordinal encoded: ['type1', 'type2']
Categorical columns that will be dropped from the dataset: ['name']

error and metrics
[[151   8]
 [  3  21]]

0.9398907103825137


In [107]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[308   4]
 [  5  37]]

0.9745762711864406


In [108]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[307   5]
 [  4  38]]

0.9745762711864406


In [109]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[310   2]
 [  5  37]]

0.980225988700565


In [110]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[309   3]
 [  5  37]]

0.9774011299435028
