### Dataset used for Classification Task:
- Aeberhard, Stefan & Forina, M. (1991). Wine. UCI Machine Learning Repository. https://doi.org/10.24432/C5PC7J.
### Dataset used for Regression Task:
- Schlimmer, Jeffrey. (1987). Automobile. UCI Machine Learning Repository. https://doi.org/10.24432/C5B01C.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

#### Wine Dataset

In [2]:
# Read the wine data from wine.csv and print a summary of its columns
# (names, non-null counts, dtypes).
wine = pd.read_csv("wine.csv")
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   class                         178 non-null    int64  
 1   Alcohol                       178 non-null    float64
 2   Malicacid                     178 non-null    float64
 3   Ash                           178 non-null    float64
 4   Alcalinity_of_ash             178 non-null    float64
 5   Magnesium                     178 non-null    int64  
 6   Total_phenols                 178 non-null    float64
 7   Flavanoids                    178 non-null    float64
 8   Nonflavanoid_phenols          178 non-null    float64
 9   Proanthocyanins               178 non-null    float64
 10  Color_intensity               178 non-null    float64
 11  Hue                           178 non-null    float64
 12  0D280_0D315_of_diluted_wines  178 non-null    float64
 13  Proli

In [3]:
# Hold out 20% of the rows as the test partition. Stratifying on the 'class'
# column keeps the class proportions alike in both partitions, and the fixed
# random_state makes the split reproducible.
wine_train, wine_test = train_test_split(
    wine,
    test_size=0.2,
    random_state=77,
    stratify=wine['class'],
)
wine_train.shape, wine_test.shape

((142, 14), (36, 14))

In [4]:
# Write both wine partitions to UTF-8 CSV files, omitting the index column.
for frame, filename in ((wine_train, "wine_train.csv"), (wine_test, "wine_test.csv")):
    frame.to_csv(filename, encoding='utf-8', index=False)

#### Automobile

In [5]:
# Read the automobile data from Automobile.csv and print a summary of its
# columns (names, non-null counts, dtypes).
auto = pd.read_csv("Automobile.csv")
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [6]:
# Some rows have no 'price' (the regression target). A model should not be
# trained on rows without a target, so those rows are set aside here; they are
# intended to be added to the test data afterwards.
test_temp = auto[auto['price'].isnull()]
automobile = auto[auto['price'].notnull()]

# The final test set (labeled hold-out rows + the missing-price rows) should be
# about 20% of the FULL dataset. The number of labeled rows to hold out is
# derived from the data rather than hard-coding the row counts
# (previously `0.2 - (4/205)`), so the split stays correct if the CSV changes.
TEST_FRACTION = 0.2
n_test_total = round(TEST_FRACTION * len(auto))
n_test_labeled = n_test_total - len(test_temp)
assert n_test_labeled > 0, "more rows with missing price than the requested test size"

# An integer test_size is interpreted as an absolute number of test rows.
automobile_train, automobile_test = train_test_split(
    automobile,
    test_size=n_test_labeled,
    random_state=77,
)
automobile_train.shape, automobile_test.shape

((164, 26), (37, 26))

In [7]:
# Add the rows with a missing 'price' to the test data. Rows with a missing
# price that are already in automobile_test are filtered out first, so
# re-running this cell does not append the same rows a second time (the filter
# is a no-op on the first run, because the split was made on priced rows only).
automobile_test = pd.concat(
    [automobile_test[automobile_test['price'].notnull()], test_temp]
)
# Save both partitions as UTF-8 CSV files, omitting the index column.
automobile_train.to_csv("automobile_train.csv", encoding='utf-8', index=False)
automobile_test.to_csv("automobile_test.csv", encoding='utf-8', index=False)