In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import csv 

## Subtasks for Data Set 2 (Adult Data Set):


In [4]:
# Load the Adult data set, report its missing values and one-hot encode the
# categorical features.
#
# NOTE: `missing_values_table` is defined in a cell further down this notebook,
# so that cell has to be executed before this one (a fresh "Run All" from top
# to bottom raises NameError here).

adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

# NOTE(review): only 14 names are supplied and index_col=False is set, so pandas
# discards any trailing field beyond the 14th. If adult.data carries the income
# label as a 15th field, it is silently dropped here - TODO confirm and add a
# name for it if the label is needed.
df = pd.read_csv('./Adult Data Set/adult.data', index_col=False, header=None, names=adult_columns)

# Missing entries are written as ' ?' (with the leading space) in the raw file.
df = df.replace(' ?', np.nan)

# Keep and print the summary table; a bare call in the middle of a cell would
# throw the returned DataFrame away and only show the two printed lines.
missing_summary = missing_values_table(df)
print(missing_summary)

# To handle null values:
# - can delete the entire row with null values
# - replace with the most frequent value in the column
# - train a classifier on the rows without null values and use it to predict
#   the missing entries


# One-hot encoding using pandas get_dummies, dropping one category for each
# categorical feature to AVOID perfect linear dependency between the dummy
# columns (https://datascience.stackexchange.com/questions/27957/why-do-we-need-to-discard-one-dummy-variable/27993#27993)
#
# NOTE: get_dummies creates no indicator column for NaN (dummy_na defaults to
# False), so a row with a missing value gets all zeros for that feature, which
# is the same encoding as the dropped baseline category. Apply one of the
# null-handling options above before modelling.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
df = df.astype({column: 'category' for column in categorical_columns})
df = pd.get_dummies(df, drop_first=True)

df.head(50)

Your selected dataframe has 14 columns.
There are 3 columns that have missing values.


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,37,284582,14,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,49,160187,5,0,0,16,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,52,209642,9,0,0,45,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,31,45781,14,14084,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9,42,159449,13,5178,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the number of columns in ``df`` and how many of them contain at
    least one missing value, then returns the per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns 'Missing Values' (count)
        and '% of Total Values' (share of rows, rounded to 2 decimals),
        sorted by '% of Total Values' in descending order.
    """
    # Count the missing values once and reuse the result for the percentage.
    mis_val = df.isnull().sum()

    total_rows = len(df)
    if total_rows:
        mis_val_percent = 100 * mis_val / total_rows
    else:
        # With zero rows the ratio would be 0/0 = NaN; nothing is missing,
        # so report 0.0 instead.
        mis_val_percent = mis_val.astype(float)

    # Make a table with the results
    mis_val_table = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Keep only the columns that really have missing values. Filtering on the
    # count (rather than on "percentage != 0") stays correct when the
    # percentage is NaN, because NaN != 0 evaluates to True.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = mis_val_table[has_missing].sort_values(
        '% of Total Values', ascending=False).round(2)  # keep 2 decimals

    # Print some summary information
    print("Your selected dataframe has {} columns.\n"
          "There are {} columns that have missing values.".format(
              df.shape[1], mis_val_table_ren_columns.shape[0]))

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

<h2>Subtasks for Data Set 3 (Auto MPG Data Set):</h2>

<body>
    <h3>Step One</h3>

    <ul>
        <li>First, we must acquire, preprocess, and analyze the data.</li>
    </ul>
</body>

In [None]:
# Reading dataset into a dataframe.
# The fields are separated by runs of whitespace; sep=r'\s+' is the supported
# spelling of the deprecated `delim_whitespace=True`.
# NOTE: the variable name `DataFrame` is kept because the following cells refer to it.
auto_mpg_columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model-year', 'origin', 'car-name']
DataFrame = pd.read_csv('./Auto MPG Data Set/auto-mpg.data', index_col=False, header=None, names=auto_mpg_columns, sep=r'\s+')

# As we can see from the .names file, only horsepower has missing values - there are 6 such rows without it.
# I've decided to remove these since horsepower is not categorical, and the instructions say: "You should
# remove any examples with missing or malformed features and note this in your report".

# Index labels of the rows whose horsepower is the missing-value marker '?'
missing_horsepower_rows = DataFrame.index[DataFrame['horsepower'] == '?']
# Drop those rows, then convert the column to float now that the markers are
# gone. The original index labels are kept (the index is not reset).
DataFrame = DataFrame.drop(index=missing_horsepower_rows).astype({'horsepower': float})


# However, now we see that car name is kind of a useless value as they all are different models.
# We will parse it so it is only the car brand (the first whitespace-separated
# word), making it a multi-valued discrete feature.
DataFrame['car-name'] = DataFrame['car-name'].str.split().str[0]


# Now we display the first few rows, looks good!
DataFrame.head(10)


In [None]:
# Our data is clean and ready to use! Let's analyze it a bit.
# describe() gives summary statistics for the numeric columns; the text column
# 'car-name' is not part of this output.
DataFrame.describe()

In [None]:
# An interesting observation: displacement is not very nicely distributed.
# (Presumably "displacement" is the engine displacement - TODO confirm against
# the data set's .names file.)
# The numeric summary leaves out the car name, so describe that column on its
# own to see how many distinct brands there are and which one is the most
# common (ford).
DataFrame["car-name"].describe()

In [None]:
# Now let's look at some graphs: how do the features plot against MPG?
# First the continuous variables. They all have pretty clear trends, although
# acceleration is slightly less clear.
# Each continuous feature gets exactly one scatter plot, in its own figure.
continuous_features = [
    ('displacement', 'red'),
    ('horsepower', 'blue'),
    ('weight', 'orange'),
    ('acceleration', 'cyan'),
]
for feature, color in continuous_features:
    DataFrame.plot(kind='scatter', x=feature, y='mpg', color=color,
                   title='mpg vs ' + feature)
plt.show()



In [None]:
# Next, the discrete variables: one scatter plot against mpg per feature,
# each drawn in its own figure and colour.
discrete_features = {
    'cylinders': 'red',
    'model-year': 'blue',
    'origin': 'orange',
    'car-name': 'green',
}
for feature, color in discrete_features.items():
    DataFrame.plot(kind='scatter', x=feature, y='mpg', color=color)
plt.show()