In [49]:
import pandas as pd

auto_mpg_data = pd.read_csv('AutoMPG.csv')
print(auto_mpg_data.shape)

(398, 8)


A Part:

In [50]:
# changed ? to median horsepower value
auto_mpg_data['horsepower'] = pd.to_numeric(auto_mpg_data['horsepower'].replace('?', None))
mean_horsepower = auto_mpg_data['horsepower'].median()
auto_mpg_data['horsepower'].fillna(mean_horsepower, inplace=True)

# there are no non numeric ordinal attributes in the dataset

# doing one hot encoding for discrete attributes like origin and cylinders
df_encoded = pd.get_dummies(auto_mpg_data, columns=['cylinders', 'origin'], prefix=['cyl', 'origin'])
df = df_encoded.replace({True: 1, False: 0})

#removing outliers and imputing median in place of them
for column in ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    mediann=int(df[column].median())
    df.loc[df[column] > upper_bound, column] = mediann
    df.loc[df[column] < lower_bound, column] = mediann

#saving this into a file
df.to_csv('AutoMPG_cleaned.csv', index=False)

B Part:

In [51]:
df = pd.read_csv('AutoMPG_cleaned.csv')
print(df.shape)

mean_x ={}
variance = {}
for col in df.columns:
    mean_val=sum(df[col])/len(df[col])
    mean_x[col]=mean_val
    var=0
    for i in df[col]:
        var+=(i-mean_val)**2
    variance[col]=var/len(df[col])
    
print(mean_x)
print(variance)

(398, 14)
{'mpg': 23.455276381909552, 'displacement': 193.42587939698493, 'horsepower': 100.87437185929649, 'weight': 2972.278894472362, 'acceleration': 15.526381909547723, 'model year': 76.0, 'cyl_3': 0.010050251256281407, 'cyl_4': 0.5125628140703518, 'cyl_5': 0.007537688442211055, 'cyl_6': 0.21105527638190955, 'cyl_8': 0.25879396984924624, 'origin_1': 0.6256281407035176, 'origin_2': 0.17587939698492464, 'origin_3': 0.1984924623115578}
{'mpg': 59.59423095376376, 'displacement': 10844.882068950259, 'horsepower': 1095.4729110375995, 'weight': 713346.3367907378, 'acceleration': 6.451389421984299, 'model year': 13.597989949748744, 'cyl_3': 0.009949243705967031, 'cyl_4': 0.2498421757026331, 'cyl_5': 0.007480871695159248, 'cyl_6': 0.16651094669326397, 'cyl_8': 0.19181965101891388, 'origin_1': 0.23421757026337828, 'origin_2': 0.14494583470114322, 'origin_3': 0.15909320471705204}


C Part:

In [52]:
# variance is highly dominated by mpg, displacement, horsepower and weight

for col in ['mpg','displacement', 'horsepower', 'weight']:
    df[col]=(df[col]-mean_x[col])/variance[col]

df_normalized = df
mean_x_nomalised={}
variance_nomalised = {}
for col in df.columns:
    mean_val=sum(df[col])/len(df[col])
    mean_x_nomalised[col]=mean_val
    var=0
    for i in df[col]:
        var+=(i-mean_val)**2
    variance_nomalised[col]=var/len(df[col])

# print(df_normalized.head(10))
# print(mean_x_nomalised)
# print(variance_nomalised)

D Part:

In [53]:
#preprocessing: label encoding for cylinders using sklearn
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
auto_mpg_data['num_cylinders'] = label_encoder.fit_transform(auto_mpg_data['cylinders'])

from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(df['model year'], auto_mpg_data['cylinders'])
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

alpha = 0.05

print("Chi-squared Statistic:", chi2_stat)
print("Degrees of Freedom:", dof)
print("P-value:", p_value)
print("Expected Frequencies:\n", expected)

# Conclusion
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant effect of model year on the number of cylinders.")
else:
    print("Fail to reject the null hypothesis: There is no significant effect of model year on the number of cylinders.")

Chi-squared Statistic: 121.05451233218379
Degrees of Freedom: 48
P-value: 3.0677113413769835e-08
Expected Frequencies:
 [[ 0.29145729 14.86432161  0.21859296  6.12060302  7.50502513]
 [ 0.28140704 14.35175879  0.21105528  5.90954774  7.24623116]
 [ 0.28140704 14.35175879  0.21105528  5.90954774  7.24623116]
 [ 0.40201005 20.50251256  0.30150754  8.44221106 10.35175879]
 [ 0.27135678 13.83919598  0.20351759  5.69849246  6.98743719]
 [ 0.30150754 15.37688442  0.22613065  6.33165829  7.7638191 ]
 [ 0.35175879 17.93969849  0.2638191   7.38693467  9.05778894]
 [ 0.28140704 14.35175879  0.21105528  5.90954774  7.24623116]
 [ 0.36180905 18.45226131  0.27135678  7.59798995  9.31658291]
 [ 0.29145729 14.86432161  0.21859296  6.12060302  7.50502513]
 [ 0.28140704 14.35175879  0.21105528  5.90954774  7.24623116]
 [ 0.29145729 14.86432161  0.21859296  6.12060302  7.50502513]
 [ 0.31155779 15.88944724  0.23366834  6.54271357  8.02261307]]
Reject the null hypothesis: There is a significant effect of