In [68]:
import pandas as pd

auto_mpg_data = pd.read_csv('AutoMPG.csv')
print(auto_mpg_data.shape)

(398, 8)


Functions:

In [69]:
def calculate_median(dataset):
    sorted_data = sorted(dataset)
    return sorted_data[len(sorted_data)//2]

def quantile_(column, quantile_value):
    sorted_column = sorted(column)
    
    n = len(sorted_column)
    index = (n - 1) * quantile_value
    
    lower_index = int(index)
    upper_index = lower_index + 1 if lower_index + 1 < n else lower_index
    frac_part = index - lower_index
    
    if frac_part == 0:
        return sorted_column[lower_index]
    else:
        lower_value = sorted_column[lower_index]
        upper_value = sorted_column[upper_index]
        return lower_value + (upper_value - lower_value) * frac_part


A Part:

In [70]:
# changed ? to median horsepower value
auto_mpg_data['horsepower'] = pd.to_numeric(auto_mpg_data['horsepower'].replace('?', None))
mean_horsepower = calculate_median(auto_mpg_data['horsepower'])
auto_mpg_data['horsepower'].fillna(mean_horsepower, inplace=True)

# there are no non numeric ordinal attributes in the dataset

# doing one hot encoding for discrete attributes like origin and cylinders
df_encoded = pd.get_dummies(auto_mpg_data, columns=['cylinders', 'origin'], prefix=['cyl', 'origin'])
df = df_encoded.replace({True: 1, False: 0})

#removing outliers and imputing median in place of them
for column in ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']:
    Q1 = quantile_(df[column],0.25)
    Q3 = quantile_(df[column],0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    mediann=calculate_median(df[column])
    df.loc[df[column] > upper_bound, column] = mediann
    df.loc[df[column] < lower_bound, column] = mediann

#saving this into a file
df.to_csv('AutoMPG_cleaned.csv', index=False)

B Part:

In [71]:
df = pd.read_csv('AutoMPG_cleaned.csv')
print(df.shape)

mean_x ={}
variance = {}
for col in df.columns:
    mean_val=sum(df[col])/len(df[col])
    mean_x[col]=mean_val
    var=0
    for i in df[col]:
        var+=(i-mean_val)**2
    variance[col]=var/len(df[col])
    
print(mean_x)
print(variance)

(398, 14)
{'mpg': 23.455276381909552, 'displacement': 193.42587939698493, 'horsepower': 101.2537688442211, 'weight': 2972.288944723618, 'acceleration': 15.535175879396968, 'model year': 76.0, 'cyl_3': 0.010050251256281407, 'cyl_4': 0.5125628140703518, 'cyl_5': 0.007537688442211055, 'cyl_6': 0.21105527638190955, 'cyl_8': 0.25879396984924624, 'origin_1': 0.6256281407035176, 'origin_2': 0.17587939698492464, 'origin_3': 0.1984924623115578}
{'mpg': 59.59423095376376, 'displacement': 10844.882068950259, 'horsepower': 1096.7572094139036, 'weight': 713342.9742998912, 'acceleration': 6.446451099719708, 'model year': 13.597989949748744, 'cyl_3': 0.009949243705967031, 'cyl_4': 0.2498421757026331, 'cyl_5': 0.007480871695159248, 'cyl_6': 0.16651094669326397, 'cyl_8': 0.19181965101891388, 'origin_1': 0.23421757026337828, 'origin_2': 0.14494583470114322, 'origin_3': 0.15909320471705204}


C Part:

In [72]:
# variance is highly dominated by mpg, displacement, horsepower and weight

for col in ['mpg','displacement', 'horsepower', 'weight']:
    df[col]=(df[col]-mean_x[col])/variance[col]

df_normalized = df
mean_x_nomalised={}
variance_nomalised = {}
for col in df.columns:
    mean_val=sum(df[col])/len(df[col])
    mean_x_nomalised[col]=mean_val
    var=0
    for i in df[col]:
        var+=(i-mean_val)**2
    variance_nomalised[col]=var/len(df[col])

# print(df_normalized.head(10))
# print(mean_x_nomalised)
# print(variance_nomalised)

D Part:

In [73]:
#preprocessing: label encoding for cylinders using sklearn
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
auto_mpg_data['num_cylinders'] = label_encoder.fit_transform(auto_mpg_data['cylinders'])

contingency_table = pd.crosstab(df['model year'], auto_mpg_data['cylinders'])
observed = contingency_table.values

row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
total = observed.sum()

expected = [[0 for _ in range(len(col_totals))] for _ in range(len(row_totals))]
for i in range(len(row_totals)):
    for j in range(len(col_totals)):
        expected[i][j] = (row_totals[i] * col_totals[j]) / total

chi_squared_stat = 0
for i in range(len(observed)):
    for j in range(len(observed[i])):
        if expected[i][j] != 0:
            chi_squared_stat += ((observed[i][j] - expected[i][j]) ** 2) / expected[i][j]


rows, cols = observed.shape
dof = (rows - 1) * (cols - 1)

print("Chi-squared Statistic:", chi_squared_stat)
print("Degrees of Freedom:", dof)

if chi_squared_stat > 65.171:  # critical value for dof=48 and alpha=0.05 seen from the table attached
    print("Model year affects the number of cylinders.")
else:
    print("Model year does not affect the number of cylinders.")

Chi-squared Statistic: 121.05451233218376
Degrees of Freedom: 48
Model year affects the number of cylinders.
