# Main Notebook

## Libraries

In [1]:
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import datetime
def timestamp():
    """Return the current local date and time as a filename-safe string.

    The result has the form ``YYYY-MM-DD--HH-MM-SS``: second resolution and
    no colons, so it can be embedded directly in a file name.

    Returns:
        str: The formatted timestamp.
    """
    # Read the clock exactly once. Sampling it separately for the date and
    # for the time could pair the old date with the new time around midnight.
    return datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

## Load data and describe

In [3]:
# Locations of the competition files, relative to the notebook directory.
train_csv_path = "../data/train.csv"
test_csv_path = "../data/test.csv"

# Training rows (features + target) and the rows to predict for submission.
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [5]:
# Print the column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  object 
 2   cat1    300000 non-null  object 
 3   cat2    300000 non-null  object 
 4   cat3    300000 non-null  object 
 5   cat4    300000 non-null  object 
 6   cat5    300000 non-null  object 
 7   cat6    300000 non-null  object 
 8   cat7    300000 non-null  object 
 9   cat8    300000 non-null  object 
 10  cat9    300000 non-null  object 
 11  cont0   300000 non-null  float64
 12  cont1   300000 non-null  float64
 13  cont2   300000 non-null  float64
 14  cont3   300000 non-null  float64
 15  cont4   300000 non-null  float64
 16  cont5   300000 non-null  float64
 17  cont6   300000 non-null  float64
 18  cont7   300000 non-null  float64
 19  cont8   300000 non-null  float64
 20  cont9   300000 non-null  float64
 21  cont10  30

In [6]:
# Drop the row identifier so it is not treated as a model feature.
# `errors="ignore"` keeps this cell idempotent: re-running it after `id` has
# already been removed no longer raises KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [7]:
# Number of missing entries per column (all zero for this training set).
df.isna().sum()

cat0      0
cat1      0
cat2      0
cat3      0
cat4      0
cat5      0
cat6      0
cat7      0
cat8      0
cat9      0
cont0     0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
target    0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,0.524634,0.506649,0.444115,0.446214,0.455471,0.508337,0.478345,0.455904,0.459321,0.526899,0.504943,0.529938,0.524549,0.503349,7.45626
std,0.204875,0.235269,0.200089,0.238669,0.200695,0.231612,0.192432,0.204493,0.220642,0.204025,0.201549,0.23086,0.220892,0.225218,0.887295
min,-0.093505,-0.055105,-0.060274,0.13476,0.189216,-0.087247,0.043953,0.208703,0.004041,0.07304,0.059644,0.064161,-0.0056,0.158121,0.0
25%,0.370451,0.352307,0.314121,0.214572,0.279853,0.338747,0.339896,0.278041,0.308655,0.361957,0.338898,0.316662,0.332143,0.291289,6.798341
50%,0.492208,0.615156,0.457271,0.377823,0.411351,0.441384,0.41009,0.360736,0.425801,0.488867,0.519855,0.558827,0.407365,0.433909,7.496503
75%,0.654793,0.68815,0.554835,0.719758,0.621808,0.709515,0.604246,0.639388,0.541525,0.752765,0.672809,0.720381,0.732431,0.73087,8.161166
max,1.052666,0.851746,1.017689,1.006469,0.99405,1.044433,1.093312,1.036541,1.014156,0.972091,1.029773,1.038049,0.96137,0.873579,10.309208


## Visualize distributions and relationships

In [None]:
# Pairwise correlations between the numeric columns (cont* and target).
# The cat* columns hold strings; pandas >= 2.0 raises on them instead of
# silently skipping them, so restrict the frame to numeric dtypes explicitly.
corr = df.select_dtypes(include="number").corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# One histogram per float64 column, each rendered as its own figure.
numerical_features = df.select_dtypes(include="float64").columns
for feature_name in numerical_features:
    sns.histplot(data=df, x=feature_name)
    plt.show()

In [None]:
# Hexbin joint plot of every continuous feature against the target.
# `target` is itself float64, so it is dropped before selecting the feature
# columns; otherwise the loop would also plot the target against itself.
numerical_features = df.drop(columns=["target"]).select_dtypes(include="float64").columns
for feature_name in numerical_features:
    sns.jointplot(x=df[feature_name], y=df["target"], kind="hex")
    plt.show()

In [None]:
# For each object-typed column: level counts on the left, target distribution
# per level on the right.
categorical_features = df.select_dtypes(include="object").columns
for feature_name in categorical_features:
    fig, (ax_counts, ax_target) = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
    sns.histplot(data=df, x=feature_name, ax=ax_counts)
    sns.boxplot(data=df, x=feature_name, y="target", ax=ax_target)
    plt.show()

## Feature engineering

In [None]:
# NOTE(review): `standardizer`, `features_train` and `features_test` are not
# defined in any cell visible in this notebook, and this cell has no execution
# count. Unless they are defined elsewhere, it raises NameError on a fresh
# kernel. Scaling of the numeric columns is also done by the StandardScaler
# step of the preprocessing pipeline in "Prepare for models" -- confirm
# whether this cell is still needed.
# Fit on the training features only, then apply the same transform to both sets.
standardizer.fit(features_train)
features_train_std = standardizer.transform(features_train)
features_test_std = standardizer.transform(features_test)

## Prepare for models

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV

In [26]:
# Column groups: continuous predictors (target excluded) and string-valued ones.
numerical_features = df.drop(columns=["target"]).select_dtypes(include="float64").columns
categorical_features = df.select_dtypes(include="object").columns

# Continuous columns are standardised; categorical columns are one-hot encoded
# with the first level of each column dropped.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
categorial_transformer = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorial_transformer, categorical_features),
    ]
)

# Hold out 30% of the labelled rows for evaluation, with a fixed seed.
features = df.drop(columns="target")
target = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=1
)

## Training

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [28]:
# Chain the shared preprocessing with an ordinary least-squares model so the
# transformers are fitted on the training split only.
regression_steps = [
    ("preprocessor", preprocessor),
    ("regression", LinearRegression()),
]
regression = Pipeline(regression_steps)

regression.fit(X_train, y_train)

In [29]:
# Predictions on both splits of the labelled data.
y_train_pred = regression.predict(X_train)
y_test_pred = regression.predict(X_test)

# Root mean squared error on each split.
train_rmse = math.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = math.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Training error: {train_rmse}")
print(f"Test error: {test_rmse}")

Training error: 0.8634031166242092
Test error: 0.8639449464826138


## Submission

In [37]:
# Keep the ids for the submission file and remove them from the model input.
# The guard makes the cell idempotent: on a re-run `id` is already gone, so the
# previously captured `submission_id` is kept instead of raising KeyError.
if "id" in df_test.columns:
    submission_id = df_test["id"]
    df_test = df_test.drop(columns="id")

In [32]:
# Predict the target for the competition test rows with the fitted pipeline.
submission_y = regression.predict(df_test)

In [40]:
# Two-column submission frame: the original row ids and the predicted target.
submission = pd.DataFrame({"id": submission_id, "target": submission_y})

In [41]:
# Display the submission frame as a visual check before writing it to disk.
submission

Unnamed: 0,id,target
0,0,7.559411
1,5,7.869817
2,15,7.637395
3,16,7.520976
4,17,7.330620
...,...,...
199995,499987,7.347568
199996,499990,7.406191
199997,499991,7.545454
199998,499994,7.472477


In [49]:
# Write the predictions to a timestamped CSV so earlier submissions are kept.
# `to_csv` does not create missing directories, so make sure the output folder
# exists first (e.g. on a fresh checkout).
submission_dir = Path("../submissions")
submission_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_dir / f"submission_{timestamp()}.csv", index=False)