In [82]:
%%writefile LiweiChen-L05-CategoryData.py

# 1. import necessary packages
from sklearn.preprocessing import *
import pandas as pd
import numpy as np

# 2. read in dataset and generate columns
# The file is whitespace-delimited and has no header row, so column names are
# assigned after loading.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
# sep is a raw string because '\s' in a plain literal is an invalid escape
# sequence. na_values='?' parses the file's missing-value marker (used in the
# horsepower field) as NaN so that column loads as numeric rather than text.
auto_data = pd.read_csv(url, sep=r'\s+', header=None, na_values='?')
auto_data.columns = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
    "origin",
    "car_name",
]

# 3. Normalize numeric values
# Fit a StandardScaler on the continuous columns and write the scaled values
# back over the originals.
numeric_cols = ['mpg', 'displacement', 'weight', 'acceleration']
scaled_values = StandardScaler().fit_transform(auto_data[numeric_cols])
auto_data[numeric_cols] = scaled_values

# 4. Bin mpg column into equal frequency percentiles
# Five evenly spaced percentiles (0, 25, 50, 75, 100) delimit four bins.
percentiles = np.linspace(0, 100, 5)
bounds = np.percentile(auto_data['mpg'], percentiles)
# include_lowest=True closes the first bin on the left; without it the row(s)
# holding the minimum mpg sit exactly on the open edge and are binned as NaN.
# Plain column assignment replaces the float column outright, so the result
# keeps the categorical dtype produced by pd.cut.
auto_data['mpg'] = pd.cut(
    auto_data['mpg'],
    bounds,
    labels=['1', '2', '3', '4'],
    include_lowest=True,
)

# 5. Decode categorical data for origin column
# Origin codes in the Auto MPG data are 1 = USA, 2 = Europe, 3 = Japan
# (NOTE(review): mapping taken from the published description of this data
# set; confirm against car_name if in doubt). Mapping the whole column at once
# yields a string column, instead of writing strings element-wise into an
# integer column. Any code outside 1-3 becomes NaN and is handled in step 6.
origin_labels = {1: "USA", 2: "Europe", 3: "Japan"}
auto_data["origin"] = auto_data["origin"].map(origin_labels)

# 6. Impute missing categories for null values as unknown
auto_data["origin"] = auto_data["origin"].fillna("Unknown")

# 7. Consolidate categorical data for origin into US vs non-US
is_foreign = auto_data["origin"].isin(["Europe", "Japan"])
auto_data.loc[is_foreign, "origin"] = "non_US"

#8 One-hot encode origin column
# Add one 0/1 indicator column per consolidated category, named after it.
for category in ("non_US", "USA"):
    auto_data[category] = auto_data["origin"].eq(category).astype(int)

#9 Remove obsolete columns.
# The indicator columns now carry the origin information, so drop the source.
auto_data = auto_data.drop(columns=["origin"])

#10 Present plots for categorical columns.
# Bar chart of how many rows have USA == 1 versus USA == 0.
usa_counts = auto_data["USA"].value_counts()
usa_counts.plot(kind='bar')

# Summary:
# In this data, I treated only one categorical column: "origin". I decoded its values 1, 2 and 3 as the geographic origins of the cars.
# Any null values were imputed as "Unknown", though this data set has no missing origin values.
# For consolidation, I bucketed origin into US vs. non-US, because the US frequency is higher than that of the other two origins combined.
# I one-hot encoded the consolidated categories and plotted them to demonstrate that, even after consolidation, US has the highest frequency.


Writing LiweiChen-L05-CategoryData.py
