In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.stats import linregress
from sklearn.metrics import r2_score
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from feature_engine import EqualFrequencyDiscretiser

In [None]:
# Load the China GDP table and pull out the two columns used in this section:
# 'Year' becomes x and 'Value' becomes y.
dataChina = pd.read_csv('1_china_gdp.csv')
x, y = dataChina['Year'], dataChina['Value']


# A. Nonlinear Transformation

In [None]:
# Fit a straight line to log(y) against x, then map the fitted line back to
# the original scale as exp(intercept) * exp(slope * x).
y_log = np.log(y)
fit = linregress(x, y_log)
slope, intercept = fit.slope, fit.intercept
rvalue, pvalue, stderr = fit.rvalue, fit.pvalue, fit.stderr

growth = np.exp(slope * x)
predict_y = np.exp(intercept) * growth

# R^2 is evaluated against the original y values, not against log(y).
r2 = r2_score(y, predict_y)
print(f"The coefficient of determination = {r2}")

In [None]:
# Residuals: observed values minus the model's predictions.
residuals = y - predict_y

# Scatter the residuals against x with a horizontal reference line at zero.
fig, ax = plt.subplots()
ax.scatter(x, residuals, label='Residuals')
ax.axhline(y=0, label='Zero Residual Line')
ax.set_xlabel('X')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
ax.grid(True)
ax.legend()
plt.show()

# B. Categorical Encoding

In [None]:
# Read the Melbourne dataset from disk and preview its first five rows.
melb_csv = '2_melb_data.csv'
dataMelb = pd.read_csv(melb_csv)
dataMelb.head(n=5)

In [None]:
# Columns to encode; each one is replaced in dataMelb by the codes that
# OrdinalEncoder produces for it.
categories = ['Suburb', 'Type', 'Method']

ordinal_encoder = OrdinalEncoder()
encoded_values = ordinal_encoder.fit_transform(dataMelb[categories])
dataMelb[categories] = encoded_values

# C. Discretization

In [None]:
# Read the training dataset from disk and preview its first five rows.
train_csv = '3_train.csv'
dataTrain = pd.read_csv(train_csv)
dataTrain.head(n=5)

In [None]:
# Features: every column that DataFrame.describe() summarises with its default
# settings. Target: the 'Transported' column.
numerical = list(dataTrain.describe().columns)
X, y = dataTrain[numerical], dataTrain['Transported']

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

In [None]:
# Impute missing values with per-column medians computed on the training
# split only, and reuse those same medians for the test split so no test
# statistics are used.
#
# The medians are captured once, before X_train is modified, and the filled
# frames are assigned back instead of using inplace=True: X_train / X_test come
# out of train_test_split, and in-place edits on such frames can raise
# pandas' SettingWithCopyWarning. Reassignment also keeps the cell safe to re-run.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X_train.head()

In [None]:
# Discretise 'Age' into q=10 equal-frequency bins.
equal_frequency_discretizer = EqualFrequencyDiscretiser(q=10, variables=['Age'])

# Learn the bin edges from the training split only, then apply those same
# edges to the test split. Calling fit_transform on X_test would re-learn the
# edges from the test data, so the same bin number would mean different age
# ranges in train and test (and test statistics would leak into preprocessing).
X_train = equal_frequency_discretizer.fit_transform(X_train)
X_test = equal_frequency_discretizer.transform(X_test)
X_train.head()

In [None]:
# Bar chart of how many training rows have each value in the 'Age' column.
fig, ax = plt.subplots(figsize=(12, 9))
sns.countplot(x=X_train['Age'], ax=ax)
ax.set_title("After Binning ")
plt.show()