In [1]:
import pandas as pd

In [2]:
def split_train_test_data(dataset,predicting_label,test_size=0.33,random_state=42):
    """Split a DataFrame into train/test features and targets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the feature columns and the target column.
    predicting_label : str
        Name of the target column. It is removed from the feature frames.
    test_size : float or int, default 0.33
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``; the fixed default keeps
        the split identical between runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``predicting_label`` is not a column of ``dataset``.
    """
    from sklearn.model_selection import train_test_split

    y = dataset[predicting_label]
    X = dataset.drop(columns=predicting_label)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [3]:
def spliteddata_shape(X_train,X_test,y_train,y_test):
    """Print the ``.shape`` of each split, one per line.

    The lines are printed in the order X_train, y_train, X_test, y_test,
    so each feature frame is followed by its matching target.
    """
    for part in (X_train, y_train, X_test, y_test):
        print(part.shape)

In [4]:
def fillvaluesbymean(dataset):
    """Return a copy of ``dataset`` with missing values imputed.

    Numeric columns are filled with their column mean. Object columns are
    filled with their most frequent value (the first mode), despite the
    function's name. The input frame is never modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``dataset``.
    """
    dataset1 = dataset.copy()
    a = dataset1.select_dtypes('number')
    b = dataset1.select_dtypes('object')

    if len(a.columns):
        dataset1[a.columns] = a.fillna(a.mean())

    # Write the categorical fill into the copy, not into the caller's frame.
    for col in b.columns:
        modes = b[col].mode()
        # mode() ignores NaN, so an all-NaN column has no mode; leave it as is.
        if not modes.empty:
            dataset1[col] = b[col].fillna(modes.iloc[0])

    return dataset1

In [5]:
def cardinality_cols(X_train, threshold=10):
    """Group the columns of ``X_train`` by dtype and cardinality.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame to inspect.
    threshold : int, default 10
        Object columns with fewer than ``threshold`` unique values are
        "low cardinality"; all other object columns are "high cardinality".
        A column with exactly ``threshold`` unique values counts as high, so
        every object column lands in exactly one of the two lists.

    Returns
    -------
    tuple of list
        ``(low_cardinality_cols, high_cardinality_cols, numerical_cols)``,
        each in the column order of ``X_train``.
    """
    object_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]
    unique_counts = {cname: X_train[cname].nunique() for cname in object_cols}

    low_cardinality_cols = [cname for cname in object_cols if unique_counts[cname] < threshold]
    high_cardinality_cols = [cname for cname in object_cols if unique_counts[cname] >= threshold]
    # 'number' covers every numeric width, not only int64/float64.
    numerical_cols = list(X_train.select_dtypes('number').columns)

    return low_cardinality_cols,high_cardinality_cols,numerical_cols

In [6]:
def score_dataset(X_train,X_test,y_train,y_test,predicted_label_type="object"):
    """Fit a random forest on the training split and print test-set metrics.

    When ``predicted_label_type`` is ``"object"`` a ``RandomForestClassifier``
    is used and the confusion matrix, a blank line and the accuracy are
    printed. For any other value a ``RandomForestRegressor`` is used and the
    mean absolute error and R-squared are printed. Both models are built with
    ``n_estimators=100`` and ``random_state=0``. Nothing is returned.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.metrics import mean_absolute_error, r2_score

    is_classification = predicted_label_type == "object"
    if is_classification:
        from sklearn.ensemble import RandomForestClassifier as Forest
    else:
        from sklearn.ensemble import RandomForestRegressor as Forest

    forest = Forest(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    preds = forest.predict(X_test)

    if is_classification:
        print(confusion_matrix(y_test, preds))
        print()
        print(accuracy_score(y_test, preds))
        return

    print(mean_absolute_error(y_test, preds))
    print(f'R square: {r2_score(y_test,preds)}')

In [7]:
def good_and_bad_labels(X_train,X_test):
    """Sort the object columns into safely encodable and problematic ones.

    An object column is "good" when every value in ``X_test`` for that
    column also occurs in ``X_train``, so an encoder fitted on the training
    data knows every test category. All other object columns are "bad".
    Both lists follow the column order of ``X_train`` so the result and the
    printed summary are the same on every run.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train and test feature frames; ``X_test`` must contain every object
        column of ``X_train``.

    Returns
    -------
    tuple of list
        ``(good_label_cols, bad_label_cols)``.
    """
    object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

    good_label_cols = []  # columns that can be safely ordinal encoded
    bad_label_cols = []   # problematic columns that will be dropped
    for col in object_cols:
        if set(X_test[col]).issubset(set(X_train[col])):
            good_label_cols.append(col)
        else:
            bad_label_cols.append(col)

    print('Categorical columns that will be ordinal encoded:', good_label_cols)
    print('Categorical columns that will be dropped from the dataset:', bad_label_cols)

    return good_label_cols,bad_label_cols

In [8]:

def ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols):
    """Drop the bad categorical columns and ordinal-encode the good ones.

    ``bad_label_cols`` are removed from copies of both frames. An
    ``OrdinalEncoder`` is fitted on the ``good_label_cols`` of ``X_train``
    and the same fitted encoder transforms ``X_test``. The input frames are
    not modified.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test)``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    # Remove the categorical columns that will not be encoded.
    encoded_train = X_train.drop(columns=bad_label_cols)
    encoded_test = X_test.drop(columns=bad_label_cols)

    encoder = OrdinalEncoder()
    # Fit on the training data only; the test data reuses the learned mapping.
    train_codes = encoder.fit_transform(X_train[good_label_cols])
    encoded_train[good_label_cols] = train_codes
    test_codes = encoder.transform(X_test[good_label_cols])
    encoded_test[good_label_cols] = test_codes

    return encoded_train, encoded_test

In [9]:
def drop_duplicate_values(dataset):
    """Return a new frame without the fully duplicated rows of ``dataset``.

    The first occurrence of each row is kept, the original index labels are
    preserved and the input frame is left unchanged.
    """
    return dataset.drop_duplicates()

In [10]:
#dataset 1
# NOTE(review): absolute, machine-specific Windows path; the notebook cannot be
# re-run elsewhere until this points at a configurable data directory.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 2\DUPLICATE\data.csv")

In [11]:
# Run with duplicate rows removed from the currently loaded `dataset`:
# target column "diagnosis", scored on the classification path ("object").
dataset1 = drop_duplicate_values(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
# The three column lists computed here are not used further in this cell.
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  5  62]]

0.9574468085106383


In [12]:
# Baseline run on a plain copy of the currently loaded `dataset` (no
# deduplication): target "diagnosis", classification path ("object").
dataset1 = dataset.copy()
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"diagnosis")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(381, 31)
(381,)
(188, 31)
(188,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[118   3]
 [  5  62]]

0.9574468085106383


In [13]:
#dataset2
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND1\D1 copy 3\DUPLICATE\winequality-red.csv")

In [14]:
# Baseline run on a plain copy of the currently loaded `dataset` (no
# deduplication): target "quality", classification path ("object").
dataset1 = dataset.copy()
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1095, 12)
(1095,)
(540, 12)
(540,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  1   1   3   0   0   0]
 [  0   2  11   4   1   0]
 [  0   0 172  48   3   0]
 [  0   0  63 149  10   0]
 [  0   0   2  34  29   1]
 [  0   0   0   3   2   1]]

0.6555555555555556


In [15]:
# Run with duplicate rows removed from the currently loaded `dataset`:
# target "quality", classification path ("object").
dataset1 = drop_duplicate_values(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"quality")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1095, 12)
(1095,)
(540, 12)
(540,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[  1   1   3   0   0   0]
 [  0   2  11   4   1   0]
 [  0   0 172  48   3   0]
 [  0   0  63 149  10   0]
 [  0   0   2  34  29   1]
 [  0   0   0   3   2   1]]

0.6555555555555556


In [16]:
#dataset3
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 4\DUPLICATE\diabetes.csv")

In [17]:
# Run with duplicate rows removed from the currently loaded `dataset`:
# target "Outcome", classification path ("object").
dataset1 = drop_duplicate_values(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(550, 9)
(550,)
(271, 9)
(271,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[140  36]
 [ 29  66]]

0.7601476014760148


In [18]:
# Baseline run on a plain copy of the currently loaded `dataset` (no
# deduplication): target "Outcome", classification path ("object").
dataset1 = dataset.copy()
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(550, 9)
(550,)
(271, 9)
(271,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: []

error and metrics
[[140  36]
 [ 29  66]]

0.7601476014760148


In [19]:
#dataset 4
# Read from an Excel workbook (.xls) rather than CSV.
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 5\DUPLICATE\Child Immunization Dataset.xls")

In [20]:
# Run with duplicate rows removed from the currently loaded `dataset`:
# target "IMR", classification path ("object").
dataset1 = drop_duplicate_values(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(451, 39)
(451,)
(223, 39)
(223,)

Categorical columns that will be ordinal encoded: ['FD', 'STATE', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT']

error and metrics
[[107  10]
 [  6 100]]

0.9282511210762332


In [21]:
# Baseline run on a plain copy of the currently loaded `dataset` (no
# deduplication): target "IMR", classification path ("object").
dataset1 = dataset.copy()
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"IMR")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(472, 39)
(472,)
(233, 39)
(233,)

Categorical columns that will be ordinal encoded: ['FD', 'FS']
Categorical columns that will be dropped from the dataset: ['DISTRICT', 'STATE']

error and metrics
[[114  13]
 [  7  99]]

0.9141630901287554


In [22]:
#dataset 5
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\send 2\D1 copy 7\DUPLICATE\NFL.csv")

In [25]:
# Run with imputation followed by deduplication: target "Drafted",
# classification path ("object").
# The fill must come first and the dedup must act on its result. Previously the
# dedup ran on the stale `dataset1` left by an earlier cell and was then
# overwritten by the fill, so no duplicates were ever removed here.
# NOTE(review): re-run this cell; the recorded output below predates this fix.
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2344, 17)
(2344,)
(1155, 17)
(1155,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Position_Type', 'Drafted..tm.rnd.yr.', 'School', 'Player']

error and metrics
[[240 165]
 [ 53 697]]

0.8112554112554112


In [26]:
# Baseline run with imputation only (no deduplication): target "Drafted",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)

X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Drafted")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(2344, 17)
(2344,)
(1155, 17)
(1155,)

Categorical columns that will be ordinal encoded: ['Player_Type', 'Position']
Categorical columns that will be dropped from the dataset: ['Position_Type', 'Drafted..tm.rnd.yr.', 'School', 'Player']

error and metrics
[[240 165]
 [ 53 697]]

0.8112554112554112


In [31]:
#dataset 8
# NOTE(review): no "dataset 6" or "dataset 7" cells appear between dataset 5
# and this cell in the export, and the execution counter jumps as well.
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_excel(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 11\DUPLICATE\Adult ICU patients project.xlsx")

In [32]:
# Run with imputation only (no deduplication): target "ICU Outcome",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)
# Cast the target to int before splitting so the classifier sees integer labels.
dataset1['ICU Outcome'] = dataset1['ICU Outcome'].astype(int)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"ICU Outcome")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")



(871, 30)
(871,)
(430, 30)
(430,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['If previous question is Yes, Specify:', 'Nationality', 'MRN', 'DATE OF BIRTH']

error and metrics
[[ 88  26]
 [ 21 295]]

0.8906976744186047


In [33]:
#dataset 9
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 3\D1 copy 12\DUPLICATE\2016 County Election Data.csv",sep=",")

In [34]:
# Baseline run with imputation only (no deduplication): target "Clinton-lead".
# Passing "numeric" selects the regression path of score_dataset, which prints
# the mean absolute error and R-squared.
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2172, 8)
(2172,)
(1070, 8)
(1070,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.052086074766354
R square: 0.7674862216683865


In [35]:
# Run with imputation followed by deduplication: target "Clinton-lead",
# regression path ("numeric"; prints mean absolute error and R-squared).
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Clinton-lead")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2023, 8)
(2023,)
(997, 8)
(997,)

Categorical columns that will be ordinal encoded: []
Categorical columns that will be dropped from the dataset: ['County']

error and metrics
12.445744332998997
R square: 0.7440212442417453


In [36]:
#dataset 10
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 13\DUPLICATE\churn.csv",sep=",")

In [37]:
# Baseline run with imputation only (no deduplication): target
# "numbercustomerservicecalls", classification path ("object").
# NOTE(review): the target name suggests a count; confirm that treating it as
# a multi-class label rather than a regression target is intended.
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3626, 17)
(3626,)
(1787, 17)
(1787,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 71 258  29  10   4   0   0   0   0   0]
 [ 55 495  63   6   3   1   0   0   0   0]
 [ 36 266  94   8   2   0   0   0   0   0]
 [ 21 166  16  29   1   0   0   0   0   0]
 [  5  53  10   0  30   1   0   0   0   0]
 [  3  13   3   1  11   5   0   0   0   0]
 [  0   2   1   0   7   0   0   0   0   0]
 [  2   0   1   0   3   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.4051482932288752


In [38]:
# Run with imputation followed by deduplication: target
# "numbercustomerservicecalls", classification path ("object").
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"numbercustomerservicecalls")
spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3350, 17)
(3350,)
(1650, 17)
(1650,)

Categorical columns that will be ordinal encoded: ['churn', 'internationalplan', 'voicemailplan']
Categorical columns that will be dropped from the dataset: []

error and metrics
[[ 30 283  28   0   4   0   0   0   0   0]
 [ 60 435  63   7   8   0   0   0   0   0]
 [ 30 287  59   5   4   0   0   0   0   0]
 [ 26 162  21   4   0   1   0   0   0   0]
 [  6  56   8   0  14   3   0   0   0   0]
 [  2  14   1   1  12   0   0   0   0   0]
 [  0   5   1   0   4   0   0   0   0   0]
 [  1   1   0   0   2   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0   0   0]]

0.3284848484848485


In [39]:
#dataset 11
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 14\DUPLICATE\healthcare-dataset-stroke-data.csv",sep=",")

In [40]:
# Baseline run with imputation only (no deduplication): target "stroke",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)
# Cast the target to int before splitting so the classifier sees integer labels.
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3800, 11)
(3800,)
(1872, 11)
(1872,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1735    1]
 [  89   47]]

0.9519230769230769


In [41]:
# Run with imputation followed by deduplication: target "stroke",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
# Cast the target to int before splitting so the classifier sees integer labels.
dataset1['stroke'] = dataset1['stroke'].astype('int')
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"stroke")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(3423, 11)
(3423,)
(1687, 11)
(1687,)

Categorical columns that will be ordinal encoded: ['ever_married', 'work_type', 'Residence_type', 'smoking_status']
Categorical columns that will be dropped from the dataset: ['gender']

error and metrics
[[1589    2]
 [  96    0]]

0.941908713692946


In [42]:
#dataset12
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 15\DUPLICATE\abalone.csv",sep=",")

In [43]:
# Baseline run with imputation only (no deduplication): target "Rings",
# regression path ("numeric"; prints mean absolute error and R-squared).
dataset1 = fillvaluesbymean(dataset)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(3378, 8)
(3378,)
(1664, 8)
(1664,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.3771754807692307
R square: 0.6840222011460335


In [44]:
# Run with imputation followed by deduplication: target "Rings",
# regression path ("numeric"; prints mean absolute error and R-squared).
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"Rings")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"numeric")

(2798, 8)
(2798,)
(1379, 8)
(1379,)

Categorical columns that will be ordinal encoded: ['Sex']
Categorical columns that will be dropped from the dataset: []

error and metrics
1.5624510514865844
R square: 0.5201811346272673


In [45]:
#dataset 13
# NOTE(review): absolute, machine-specific Windows path; replace with a path
# relative to a configurable data directory before sharing.
dataset = pd.read_csv(r"C:\Users\ishan\Downloads\ADA\DATASETS\SEND 4\D1 copy 16\DUPLICATE\Pokemon.csv",sep=",")

In [47]:
# Run with imputation followed by deduplication: target "legendary",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)
dataset1 = drop_duplicate_values(dataset1)
X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(718, 12)
(718,)
(354, 12)
(354,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[308   4]
 [  6  36]]

0.9717514124293786


In [48]:
# Baseline run with imputation only (no deduplication): target "legendary",
# classification path ("object").
dataset1 = fillvaluesbymean(dataset)

X_train,X_test,y_train,y_test = split_train_test_data(dataset1,"legendary")

spliteddata_shape(X_train,X_test,y_train,y_test)
print()
low_cardinality_cols,high_cardinality_cols,numerical_cols = cardinality_cols(X_train)
good_label_cols,bad_label_cols = good_and_bad_labels(X_train,X_test)
X_train_encoded,X_test_encoded = ordinal_encoding(X_train,X_test,good_label_cols,bad_label_cols)
print()
print("error and metrics")
score_dataset(X_train_encoded,X_test_encoded,y_train,y_test,"object")

(1088, 12)
(1088,)
(537, 12)
(537,)

Categorical columns that will be ordinal encoded: ['type2']
Categorical columns that will be dropped from the dataset: ['name', 'type1']

error and metrics
[[476   3]
 [  5  53]]

0.9851024208566108
