# First steps with Machine Learning

## Import Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from utils.paths import DATA_RAW_DIR
from pathlib import Path

## Load dataset

In [2]:
# Location of the raw CSV, built from the project's raw-data directory.
path_data = DATA_RAW_DIR / "data.csv"

# Report where the file is expected and whether it is actually there
# before attempting to read it.
file_found = path_data.exists()
print(f"Data path: {path_data}")
print(f"Data exists: {file_found}")

Data path: /Users/jssdev/Dev/Learning/uc/mcd-machine-learning/data/raw/data.csv
Data exists: True


In [3]:
# Read the raw CSV (comma-delimited, UTF-8 text) into a DataFrame.
df = pd.read_csv(
    path_data,
    sep=",",
    encoding="utf-8",
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Exploratory Data Analysis (EDA)

In [4]:
# Quick structural overview: shape, column names, dtypes and per-column
# missing-value counts, each section split off by a divider line.
divider = "==" * 40

print(f"Data shape: {df.shape}")
print(divider)
print(f"Data columns: {df.columns.tolist()}")
print(divider)
print(f"Data types:\n{df.dtypes}")
print(divider)
print(f"Missing values:\n{df.isnull().sum()}")


Data shape: (10, 4)
Data columns: ['Country', 'Age', 'Salary', 'Purchased']
Data types:
Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object
Missing values:
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [5]:
print("==" * 40)
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes
None


In [6]:
# Show the first ten rows to inspect the values, including missing entries.
df.head(n=10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
# Fill the gaps in the numeric columns with each column's mean, rounded to
# one decimal place.
# NOTE(review): the means are taken over the whole DataFrame, before the
# train/test split performed later in the notebook, so rows that end up in
# the test set contribute to the fill value.
for column in ("Age", "Salary"):
    column_mean = np.round(df[column].mean(), 1)
    df[column] = df[column].fillna(column_mean)

In [8]:
# Display the first ten rows again to check the result of the imputation.
df.head(n=10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.8,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.8,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [9]:
# Separate the feature matrix (every column except the target) from the
# target vector.
target_column = "Purchased"
X = df.drop(columns=[target_column])
y = df[target_column]

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Shapes of the four splits, in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8, 3), (2, 3), (8,), (2,))