# Install Dependencies

In [1]:
# Initial imports
import numpy as np
import pandas as pd
from path import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns

# Import and Clean Data

In [2]:
# Read in data
data = Path("../Resources/mpg.csv")
df = pd.read_csv(data, na_values='?')

# Drop rows with missing values
df.dropna(inplace=True)

# Drop 'car name' column
df = df.drop(["car name"],1)

# Convert 'origin' to a string
df['origin'] = df['origin'].astype(str) 

# Encode Categorical Data

In [3]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[["origin"]]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(["origin"])

# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop("origin",1)

# Train Test Split

In [5]:
# Split our preprocessed data into our features and target arrays
X = df.drop(columns=['mpg'])
y = df['mpg']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)

In [10]:
# Calculate the `mean` and `std` for each feature
X_train.describe().transpose()[['mean', 'std']]

Unnamed: 0,mean,std
cylinders,5.356401,1.683551
displacement,188.468858,103.019158
horsepower,102.183391,37.864767
weight,2924.584775,847.660527
acceleration,15.60173,2.6518
model year,75.861592,3.682006
origin_1,0.615917,0.487221
origin_2,0.176471,0.381881
origin_3,0.207612,0.406301


# Visualize Relationships

In [6]:
# Bonus: Create a pairplot ot visualize the relationship between the features: 
# ['mpg', 'cylinders', 'displacement', 'weight', 'acceleration']
