In [None]:
import pandas as pd

# Read the vehicles CSV (path is relative to the notebook's working
# directory) into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer='vehicles/vehicles.csv')
data

In [None]:
# Show the column labels of the frame as loaded from the CSV.
data.columns

In [None]:
# Rename all columns at once.
# The assignment is positional: the i-th name below replaces the i-th existing
# column label, so the list must have exactly one entry per column in `data`.
data.columns = [
    'Manufacturer',
    'Model',
    'Year',
    'Displacement',
    'Cylinders',
    'Transmission',
    'Drivetrain',
    'Vehicle Class',
    'Fuel Type',
    'Fuel Barrels/Year',
    'City MPG',
    'Highway MPG',
    'Combined MPG',
    'CO2 Emission Grams/Mile',
    'Fuel Cost/Year',
]

In [None]:
# Preview the first rows to check the column labels currently on `data`.
data.head()

In [None]:
# Rename only selected columns; labels missing from the mapping are left as-is.
data = data.rename(
    {'Manufacturer': 'Make', 'Displacement': 'Engine Displacement'},
    axis='columns',
)

data.columns

In [None]:
# Reorder the columns: selecting with a list of labels returns the columns in
# the order the list names them.
column_order = [
    'Year', 'Make', 'Model',
    'Vehicle Class', 'Transmission', 'Drivetrain',
    'Fuel Type', 'Cylinders', 'Engine Displacement',
    'Fuel Barrels/Year', 'City MPG', 'Highway MPG',
    'Combined MPG', 'CO2 Emission Grams/Mile', 'Fuel Cost/Year',
]

data = data.loc[:, column_order]


In [None]:
# Disabled example: an alternative column selection that lists
# 'Fuel Cost/Year' first. It is commented out, so only the preview below runs.
# data[['Fuel Cost/Year','Year','Make','Model','Vehicle Class',
#                 'Transmission','Drivetrain','Fuel Type',
#                 'Cylinders','Engine Displacement','Fuel Barrels/Year',
#                 'City MPG','Highway MPG','Combined MPG',
#                 'CO2 Emission Grams/Mile']]
# Preview the first rows of the current frame.
data.head()

In [None]:
# Distinct values found in the 'Fuel Type' column, collected into a set.
{fuel_type for fuel_type in data['Fuel Type']}

In [None]:
# filtering - selecting rows on conditions
# Keep rows where Make is 'Ford', Cylinders is at least 6 and Combined MPG is
# below 18; the three boolean masks are combined element-wise with &.
filtered = data.loc[
    data['Make'].eq('Ford')
    & data['Cylinders'].ge(6)
    & data['Combined MPG'].lt(18)
]

filtered

In [None]:
# Keep rows whose Make is Subaru or Ford, with exactly 4 cylinders and a
# Year earlier than 2000.
filtered = data.loc[
    data['Make'].isin(['Subaru', 'Ford'])
    & data['Cylinders'].eq(4)
    & data['Year'].lt(2000)
]

filtered

In [None]:
# Diesel vehicles with at most six cylinders made by Volkswagen.
# The comparison is an exact string match, so the make must be spelled
# correctly: the earlier literal 'Volswagen' was a typo.
# NOTE(review): assumes the data uses the standard spelling 'Volkswagen' --
# confirm against data['Make'].unique().
vwdiesels = data[(data['Make']=='Volkswagen') &
                 (data['Cylinders']<=6) &
                 (data['Fuel Type']=='Diesel')]
# (rows, columns) of the filtered frame
vwdiesels.shape

In [None]:
# Binning values
# Names for the five MPG bins, listed from the lowest bin to the highest.
mpg_labels = [
    'Very Low',
    'Low',
    'Moderate',
    'High',
    'Very High',
]


In [None]:
# Equal-width binning: pd.cut with an integer splits the observed range of
# 'Combined MPG' into that many equal-width intervals, named by mpg_labels.
data['bins'] = pd.cut(x=data['Combined MPG'], bins=5, labels=mpg_labels)
# Number of rows that landed in each bin.
data['bins'].value_counts()

In [None]:
# Quantile binning: pd.qcut with q=5 chooses the edges from the quantiles of
# 'Combined MPG' (quintiles) rather than from equal-width intervals.
data['bins'] = pd.qcut(x=data['Combined MPG'], q=5, labels=mpg_labels)
# Number of rows that landed in each bin.
data['bins'].value_counts()

In [None]:
# Custom bin edges for 'Combined MPG'. pd.cut uses right-closed intervals by
# default: (0, 14], (14, 21], (21, 23], (23, 30], (30, inf).
# The last edge is infinity rather than 40 because pd.cut returns NaN for any
# value outside the outermost edges; a hard cap at 40 would silently leave
# vehicles above 40 MPG without a label instead of marking them 'Very High'.
cutoffs = [0, 14, 21, 23, 30, float('inf')]
bins = pd.cut(data['Combined MPG'], cutoffs, labels=mpg_labels)
bins.head(10)

In [None]:
# add bin-names as a column
# Store the custom-edge bin label of each row's 'Combined MPG' in 'bins'.
data['bins'] = pd.cut(
    x=data['Combined MPG'],
    bins=cutoffs,
    labels=mpg_labels,
)
data.head()

In [None]:
# Reorder the columns so that 'bins' sits directly after 'Combined MPG'.
data = data.loc[:, [
    'Year', 'Make', 'Model', 'Vehicle Class',
    'Transmission', 'Drivetrain', 'Fuel Type', 'Cylinders',
    'Engine Displacement', 'Fuel Barrels/Year',
    'City MPG', 'Highway MPG', 'Combined MPG', 'bins',
    'CO2 Emission Grams/Mile', 'Fuel Cost/Year',
]]
data.head()

In [None]:
# Remove the 'bins' column again; drop returns a new frame without it.
data = data.drop(labels='bins', axis='columns')
data.head()

In [None]:
# create categorical column
# Derive 'TransType' from the first letter of 'Transmission':
# values starting with 'A' -> 'Automatic', values starting with 'M' -> 'Manual'.
# na=False makes a missing Transmission count as "no match"; without it
# str.startswith yields NaN for those rows and .loc rejects a boolean mask
# that contains NaN. Rows matching neither prefix keep a missing TransType.
data.loc[data['Transmission'].str.startswith('A', na=False), 'TransType'] = 'Automatic'
data.loc[data['Transmission'].str.startswith('M', na=False), 'TransType'] = 'Manual'
data.head(60)

In [None]:
# get unique values from a column
# Variant 1: collect the column's values into a Python set.
{drive for drive in data['Drivetrain']}

In [None]:
# Variant 2: let pandas return the distinct values of the column.
pd.unique(data['Drivetrain'])

In [None]:
# one-hot encoding (get_dummies)
# Produces one indicator column per distinct 'Drivetrain' value.
drivetrain = pd.get_dummies(data=data['Drivetrain'])
drivetrain.head(n=10)

In [None]:
# merge datasets
# step 1: create dataset to merge
# One row per Make holding the mean 'Combined MPG', exposed as 'Avg_MPG'.
avg_mpg = (
    data.groupby('Make', as_index=False)['Combined MPG']
    .mean()
    .rename(columns={'Combined MPG': 'Avg_MPG'})
)
avg_mpg.head()

In [None]:
# step 2: merge (similar to inner join in sql)
# Attach each Make's 'Avg_MPG' to every row of that Make; 'inner' is the
# default join type of merge and is spelled out here for clarity.
data = data.merge(right=avg_mpg, how='inner', on='Make')
data.head(10)

In [None]:
# concatenate dataframes - add columns (equivalent to a join on index value)
# concat(axis=1) pairs rows by index label, not by position. `data` may have
# been re-indexed (and, on older pandas versions, reordered) by a merge since
# the dummies were first computed, so they are rebuilt from the current frame
# here to guarantee that every indicator row belongs to the row it is glued to.
drivetrain = pd.get_dummies(data['Drivetrain'])
data = pd.concat([data, drivetrain], axis=1)
data.head(20)

In [None]:
# concatenate dataframes - stacking rows, equivalent to sql union
# Take the Lexus rows and the Audi rows separately, then stack them vertically.
lexus = data.loc[data['Make'].eq('Lexus')]
audi = data.loc[data['Make'].eq('Audi')]

lexus_audi = pd.concat([lexus, audi], axis='index')
# The stacked frame should contain exactly the two makes selected above.
{make for make in lexus_audi['Make']}
# (display the frame itself instead by evaluating `lexus_audi`)

In [None]:
# Narrow the frame to the identifying columns plus the three MPG measures.
data = data.loc[:, [
    'Year', 'Make', 'Model',
    'City MPG', 'Highway MPG', 'Combined MPG',
]]
data

In [None]:
# Melt the frame from wide format (one row per vehicle) to long format
# (one row per vehicle per MPG variable): the id columns are repeated and the
# three MPG columns are unpivoted into 'variable' / 'value' pairs.
melted = data.melt(
    id_vars=['Year', 'Make', 'Model'],
    value_vars=['City MPG', 'Highway MPG', 'Combined MPG'],
)
# Under default IPython settings only the last expression of a cell is
# rendered, so this bare reference shows nothing by itself.
melted
# Inspect the long-format rows produced for one specific vehicle.
melted.loc[
    melted['Year'].eq(1984)
    & melted['Make'].eq('AM General')
    & melted['Model'].eq('DJ Po Vehicle 2WD')
]