In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import QuantileTransformer,RobustScaler
from sklearn.preprocessing import PowerTransformer,MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [2]:
def maxabs_scaler(X_train,X_test):
    rs = MaxAbsScaler()
    X_train4 = X_train.copy()
    X_test4  = X_test.copy()
    
    X_train4 = rs.fit_transform(X_train4)
    X_test4 = rs.transform(X_test4)
    
    return X_train4,X_test4

In [3]:
def robust_scaler(X_train,X_test):
    rs = RobustScaler()
    X_train3 = X_train.copy()
    X_test3  = X_test.copy()
    
    X_train3 = rs.fit_transform(X_train3)
    X_test3 = rs.transform(X_test3)
    
    return X_train3,X_test3

In [4]:
def quantile_transforemer_scaler(X_train,X_test):
    qts = QuantileTransformer()
    X_train5 = X_train.copy()
    X_test5  = X_test.copy()
    
    X_train5 = qts.fit_transform(X_train5)
    X_test5 = qts.transform(X_test5)
    
    return X_train5,X_test5

In [5]:
def min_max_scaler(X_train,X_test):
    mn = MinMaxScaler()
    X_train2 = X_train.copy()
    X_test2  = X_test.copy()
    
    X_train2 = mn.fit_transform(X_train2)
    X_test2 = mn.transform(X_test2)
    
    return X_train2,X_test2

In [6]:
def power_transforemer_scaler(X_train,X_test):
    pts = PowerTransformer()
    X_train6 = X_train.copy()
    X_test6  = X_test.copy()
    
    X_train6 = pts.fit_transform(X_train6)
    X_test6 = pts.transform(X_test6)
    
    return X_train6,X_test6

In [7]:
def standard_scaling(X_train,X_test):
    sc = StandardScaler()
    X_train1 = X_train.copy()
    X_test1  = X_test.copy()
    
    X_train1 = sc.fit_transform(X_train1)
    X_test1 = sc.transform(X_test1)
    
    return X_train1,X_test1

In [8]:
def split_train_test_data(dataset,predicting_label):
    
    from sklearn.model_selection import train_test_split
    y = dataset[predicting_label]
    X = dataset.drop(predicting_label,axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [9]:
def cardinality_cols(X_train):
    
    low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype == "object"] 
    high_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() > 10 and X_train[cname].dtype == "object"] 
    numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']] 
    
    return low_cardinality_cols,high_cardinality_cols,numerical_cols
    
    

In [10]:
def spliteddata_shape(X_train,X_test,y_train,y_test):
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

In [11]:
def score_dataset(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    from sklearn.metrics import mean_absolute_error,confusion_matrix
    from sklearn.metrics import r2_score,accuracy_score
    
    if predicted_label_type=="object":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(confusion_matrix(y_test,preds))
        print()
        print(accuracy_score(y_test,preds))
    
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(mean_absolute_error(y_test, preds)) 
        print(f'R square: {r2_score(y_test,preds)}')

In [12]:
def drop_categorical(X_train,X_test,y_train,y_test):
    
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_test.select_dtypes(exclude=['object'])

    print("MAE from Approach 1 (Drop categorical variables):")
    score_dataset(drop_X_train, drop_X_valid, y_train, y_test)

In [13]:
def good_and_bad_labels(X_train,X_test):
    object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

    # Columns that can be safely ordinal encoded
    good_label_cols = [col for col in object_cols if 
                       set(X_test[col]).issubset(set(X_train[col]))]
        
    # Problematic columns that will be dropped from the dataset
    bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
    print('Categorical columns that will be ordinal encoded:', good_label_cols)
    print('Categorical columns that will be dropped from the dataset:', bad_label_cols)
    
    return good_label_cols,bad_label_cols

In [14]:

def ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols):
    from sklearn.preprocessing import OrdinalEncoder
    # Drop categorical columns that will not be encoded
    label_X_train = X_train.drop(bad_label_cols, axis=1)
    label_X_valid = X_test.drop(bad_label_cols, axis=1)

    ordinal_encoder = OrdinalEncoder()
    label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
    label_X_valid[good_label_cols] = ordinal_encoder.transform(X_test[good_label_cols])
    
    return label_X_train,label_X_valid

In [15]:
def remove_nullvalues(dataset):
    dataset1 = dataset.copy()
    dataset1 = dataset.dropna(axis=0)
    return dataset1

In [16]:
def fillvaluesbymean(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.mean())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [17]:
def fillvaluesbymedian(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(a.median())
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [18]:
def fillvaluesbymode(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    for i in a.columns:
        dataset1[i] = dataset1[i].fillna(dataset1[i].mode()[0])
    dataset1[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [19]:
def fillvaluesbybfill(dataset):
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')
    
    dataset1[a.columns] = a.fillna(method='bfill')
    dataset[b.columns]  = b.fillna(b.agg(lambda x:x.mode().values[0]))
    
    return dataset1

In [20]:
#dataset1

In [21]:
import pandas as pd 
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\MY2022 Fuel Consumption Ratings.xlsx")

In [22]:
#remove null values
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(615, 14)
(615,)
(303, 14)
(303,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']

error and metrics
1.721737309445229
R square: 0.9952836980832745


In [23]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']

error and metrics
3.1321649170850443
R square: 0.9698214946328316


In [24]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']

error and metrics
3.0944659972615236
R square: 0.9667388042177254


In [25]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']

error and metrics
3.0469739084132073
R square: 0.9657567077865404


In [26]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"CO2 Emissions(g/km)")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(633, 14)
(633,)
(313, 14)
(313,)

Categorical columns that will be ordinal encoded: ['Make', 'Vehicle Class', 'Fuel Type']
Categorical columns that will be dropped from the dataset: ['Model', 'Transmission']

error and metrics
2.6247978092195354
R square: 0.9795439303117812


In [27]:
#dataset2

In [28]:
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 2\MISSING VALUES\data.csv")

In [29]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(349, 31)
(349,)
(172, 31)
(172,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[100   3]
 [  6  63]]

0.9476744186046512


In [30]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  6  61]]

0.9521276595744681


In [31]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  4  63]]

0.9627659574468085


In [32]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[117   4]
 [  4  63]]

0.9574468085106383


In [33]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  5  62]]

0.9574468085106383


In [34]:
#dataset 3
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 3\MISSING VALUES\winequality-red.csv")

In [35]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1046, 11)
(1046,)
(516, 11)
(516,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   1   0   0   0]
 [  1   0  10   3   1   0]
 [  0   0 177  40   1   0]
 [  0   0  55 141  11   1]
 [  0   0   6  28  32   0]
 [  0   0   0   6   2   0]]

0.6782945736434108


In [36]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 165  50   2   0]
 [  0   0  47 150  16   0]
 [  0   0   0  41  28   1]
 [  0   0   0   0   6   1]]

0.6515151515151515


In [37]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 165  49   3   0]
 [  0   0  48 148  17   0]
 [  0   0   0  42  27   1]
 [  0   0   0   1   5   1]]

0.6458333333333334


In [38]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 166  49   2   0]
 [  0   0  51 143  19   0]
 [  0   0   2  39  28   1]
 [  0   0   0   0   6   1]]

0.6401515151515151


In [39]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1071, 11)
(1071,)
(528, 11)
(528,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  0   0   2   0   0   0]
 [  0   0  11   8   0   0]
 [  0   0 162  53   2   0]
 [  0   0  51 146  16   0]
 [  0   0   2  37  30   1]
 [  0   0   0   0   6   1]]

0.6420454545454546


In [40]:
#datset 4
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 4\MISSING VALUES\diabetes.csv")

In [41]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(489, 8)
(489,)
(242, 8)
(242,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[128  27]
 [ 33  54]]

0.7520661157024794


In [42]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[137  31]
 [ 32  54]]

0.7519685039370079


In [43]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[139  29]
 [ 36  50]]

0.7440944881889764


In [44]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[134  34]
 [ 31  55]]

0.7440944881889764


In [45]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(514, 8)
(514,)
(254, 8)
(254,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[139  29]
 [ 34  52]]

0.7519685039370079


In [46]:
#dataset 5
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 5\MISSING VALUES\Child Immunization Dataset.xls")

In [47]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(434, 39)
(434,)
(214, 39)
(214,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[112   3]
 [  6  93]]

0.9579439252336449


In [48]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[106  11]
 [  5 101]]

0.9282511210762332


In [49]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[110   7]
 [  4 102]]

0.9506726457399103


In [50]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[109   8]
 [  6 100]]

0.9372197309417041


In [51]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['IS_P', 'DISTRICT']

error and metrics
[[107  10]
 [  6 100]]

0.9282511210762332


In [52]:
#dataset6
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 6\MISSING VALUES\hmeq.csv")

In [53]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"BAD")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2253, 12)
(2253,)
(1111, 12)
(1111,)

Categorical columns that will be ordinal encoded: ['REASON', 'JOB']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1011    0]
 [  48   52]]

0.9567956795679567


In [75]:
#dataset7
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 7\MISSING VALUES\NFL.csv")

In [76]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(789, 17)
(789,)
(390, 17)
(390,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type']
Categorical columns that will be dropped from the dataset: ['Position', 'Player', 'Drafted..tm.rnd.yr.', 'School']

error and metrics
[[390]]

1.0


In [77]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'Drafted..tm.rnd.yr.', 'School']

error and metrics
[[178 213]
 [ 98 659]]

0.7290940766550522


In [78]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'Drafted..tm.rnd.yr.', 'School']

error and metrics
[[214 177]
 [ 49 708]]

0.8031358885017421


In [79]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2329, 17)
(2329,)
(1148, 17)
(1148,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Player', 'Drafted..tm.rnd.yr.', 'School']

error and metrics
[[173 218]
 [104 653]]

0.7195121951219512


dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

# this wont work because the missing values will be continous without breaks

In [80]:
#dataset 8
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\research paper\5 movie_metadata.csv")

In [81]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2516, 27)
(2516,)
(1240, 27)
(1240,)

Categorical columns that will be ordinal encoded: ['color', 'content_rating']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'movie_imdb_link', 'director_name', 'actor_1_name', 'plot_keywords', 'language', 'genres', 'country', 'movie_title', 'actor_2_name']

error and metrics
0.5317016129032258
R square: 0.5501116399028335


In [82]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'movie_imdb_link', 'director_name', 'actor_1_name', 'plot_keywords', 'content_rating', 'language', 'genres', 'country', 'movie_title', 'actor_2_name']

error and metrics
0.5906252252252252
R square: 0.47434854659636494


In [84]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'movie_imdb_link', 'director_name', 'actor_1_name', 'plot_keywords', 'content_rating', 'language', 'genres', 'country', 'movie_title', 'actor_2_name']

error and metrics
0.5905957957957958
R square: 0.48032383900785447


In [85]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'movie_imdb_link', 'director_name', 'actor_1_name', 'plot_keywords', 'content_rating', 'language', 'genres', 'country', 'movie_title', 'actor_2_name']

error and metrics
0.5935405405405405
R square: 0.4771603368598747


In [86]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"imdb_score")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 27)
(3378,)
(1665, 27)
(1665,)

Categorical columns that will be ordinal encoded: ['color']
Categorical columns that will be dropped from the dataset: ['actor_3_name', 'movie_imdb_link', 'director_name', 'actor_1_name', 'plot_keywords', 'content_rating', 'language', 'genres', 'country', 'movie_title', 'actor_2_name']

error and metrics
0.6006978978978977
R square: 0.46962381132483977


In [87]:
#dataset 9

dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 11\Adult ICU patients project.xlsx")

In [88]:
dataset1 = remove_nullvalues(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)


(791, 30)
(791,)
(390, 30)
(390,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['DATE OF BIRTH', 'MRN', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 64  30]
 [ 23 273]]

0.8641025641025641


In [89]:
dataset1 = fillvaluesbymean(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['DATE OF BIRTH', 'MRN', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[100  27]
 [ 21 282]]

0.8883720930232558


In [90]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['DATE OF BIRTH', 'MRN', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 60  25]
 [ 21 324]]

0.8930232558139535


In [91]:
dataset1 = fillvaluesbymode(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['DATE OF BIRTH', 'MRN', 'If previous question is Yes, Specify:', 'Nationality']

error and metrics
[[ 56  29]
 [ 20 325]]

0.8860465116279069


dataset1 = fillvaluesbybfill(dataset)
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

#this wont work because the missing values will be continous without breaks

In [92]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 12\MISSING VALUES\2016 County Election Data.csv")

In [93]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(1993, 8)
(1993,)
(982, 8)
(982,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.714967107942973
R square: 0.7128089942161608


In [94]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.54225295887663
R square: 0.7344128545707527


In [95]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.616325959458196
R square: 0.7332081839023319


In [96]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.535671815446339
R square: 0.7340667758837349


In [98]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.562182748244734
R square: 0.7363659123662829


In [99]:
#dataset 10
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 13\MISSING VALUES\churn.csv")

In [100]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3308, 17)
(3308,)
(1630, 17)
(1630,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 39 253  47   6   4   0   0   0   0   0]
 [ 63 426  97  16   5   0   0   0   0   0]
 [ 32 248  49   4   3   0   0   0   0   0]
 [ 20 151  34   4   2   0   0   0   0   0]
 [  8  47  10   1  13   4   0   0   0   0]
 [  1   7   2   0  12   2   0   0   0   0]
 [  1   7   1   0   2   2   0   0   0   0]
 [  1   2   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.32699386503067485


In [101]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 32 278  28   3   4   0   0   0   0   0]
 [ 59 433  66   7   8   0   0   0   0   0]
 [ 40 285  52   4   4   0   0   0   0   0]
 [ 20 172  20   1   0   1   0   0   0   0]
 [  8  49  11   0  16   3   0   0   0   0]
 [  2  13   2   1  11   1   0   0   0   0]
 [  1   4   1   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.3242424242424242


In [102]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 32 275  30   4   4   0   0   0   0   0]
 [ 48 440  64  12   9   0   0   0   0   0]
 [ 39 293  47   3   3   0   0   0   0   0]
 [ 21 165  22   4   1   1   0   0   0   0]
 [  8  54   7   0  15   3   0   0   0   0]
 [  3  13   1   1  12   0   0   0   0   0]
 [  1   3   0   0   4   2   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0]]

0.32606060606060605


In [103]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 36 283  19   3   4   0   0   0   0   0]
 [ 59 431  68   6   9   0   0   0   0   0]
 [ 34 282  59   6   4   0   0   0   0   0]
 [ 18 167  23   4   1   1   0   0   0   0]
 [  9  50   9   1  15   3   0   0   0   0]
 [  1  15   3   0  11   0   0   0   0   0]
 [  0   4   2   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0]]

0.3303030303030303


In [105]:
#dataset11

dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 14\MISSING VALUES\healthcare-dataset-stroke-data.csv",sep=",")

In [106]:
dataset1 = remove_nullvalues(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['stroke'] = dataset1['stroke'].astype('int')


(3241, 11)
(3241,)
(1597, 11)
(1597,)

Categorical columns that will be ordinal encoded: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[1519    0]
 [  77    1]]

0.9517845961177207


In [107]:
dataset1 = fillvaluesbymean(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [108]:
dataset1 = fillvaluesbymedian(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [109]:
dataset1 = fillvaluesbymode(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [110]:
dataset1 = fillvaluesbybfill(dataset)
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1590    1]
 [  95    1]]

0.943094250148192


In [111]:
#dataset 12
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 15\MISSING VALUES\abalone.csv",sep=",")

In [112]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2767, 8)
(2767,)
(1364, 8)
(1364,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.567851906158358
R square: 0.543043653360108


In [113]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.574887599709935
R square: 0.516831041417899


In [114]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.571950688905004
R square: 0.5190082345648546


In [115]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5802030456852794
R square: 0.5162492282038068


In [116]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5709064539521393
R square: 0.5171748742869584


In [117]:
#dataset 13
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 16\MISSING VALUES\Pokemon.csv",sep=",")

In [118]:
dataset1 = remove_nullvalues(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(369, 12)
(369,)
(183, 12)
(183,)

Categorical columns that will be ordinal encoded: ['type1', 'type2']
Categorical columns that will be dropped from the dataset: ['name']

error and metrics
[[150   9]
 [  0  24]]

0.9508196721311475


In [120]:
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[308   4]
 [  5  37]]

0.9745762711864406


In [121]:
dataset1 = fillvaluesbymedian(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[308   4]
 [  4  38]]

0.9774011299435028


In [122]:
dataset1 = fillvaluesbymode(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[309   3]
 [  5  37]]

0.9774011299435028


In [123]:
dataset1 = fillvaluesbybfill(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['type1', 'name']

error and metrics
[[309   3]
 [  6  36]]

0.9745762711864406
