In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold, cross_val_score, train_test_split


warnings.filterwarnings('ignore')

## Get Training Data

In [10]:
# Get training data
train = pd.DataFrame(pd.read_csv('training.csv'))
train['row'] = np.arange(len(train))
# critical_temp should be at end for ease of use
train = train.iloc[:, np.r_[82,0:81,81]]

# Get formula training data
form_train = pd.DataFrame(pd.read_csv('formula_training.csv'))
form_train['row'] = np.arange(len(form_train))
# Move new 'row' column to beginning
form_train = form_train.iloc[:, np.r_[87,0:87]]

# Create combined training table
train_full = pd.merge(train, form_train, how="inner", on="row")

# Rearrange columns and drop 'material'
train_full = train_full.iloc[:, np.r_[1:82, 83:169, 82]]

# Separate into independent and dependent variables
X = train_full.iloc[:,0:167]
Y = pd.DataFrame(train_full.iloc[:,167])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Get Test Data

In [11]:
test = pd.DataFrame(pd.read_csv('test.csv'))
test['row'] = np.arange(len(test))
# critical_temp should be at end for ease of use
test = test.iloc[:, np.r_[81,0:81]]

# Get formula training data
form_test = pd.DataFrame(pd.read_csv('formula_test.csv'))
form_test['row'] = np.arange(len(form_test))

# Move new 'row' column to beginning
# Drop 'material'
form_test = form_train.iloc[:, np.r_[0:87]]

# Create combined training table
test_full = pd.merge(test, form_test, how="inner", on="row")
test_full = test_full.iloc[:, np.r_[1:168]]

## Split training data into training and validation sets

In [12]:
# Split the data using train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2)

# Print the shape of the data for verification
print(f'X_train shape: {X_train.shape}\nY_train shape: {Y_train.shape}')
print(f'X_train shape: {X_test.shape}\nY_train shape: {Y_test.shape}')

X_train shape: (16210, 167)
Y_train shape: (16210, 1)
X_train shape: (4053, 167)
Y_train shape: (4053, 1)


## Apply multiple linear regression model to training dataset

In [16]:
# Apply multiple linear regression model
mlm = LinearRegression()

# Fit model
mlm.fit(X_train, Y_train)

# Make predictions on validation set
Y_pred = mlm.predict(X_test)

# Compute RMSE
mean_squared_error(Y_test, Y_pred, squared = False)

16.831029781018803

## Ridge Regression ##

Ridge performs about the same as mlm.

In [17]:
# Create array of alphas to test (recommended range from 0 to 1 at 0.01 intervals)
alphas = np.arange(0, 1, 0.01)

# Set folds
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Fit model to find best alpha
ridgeR = RidgeCV(alphas = alphas, cv = cv)
ridgeR.fit(X_train, Y_train)

# Find best alpha
alpha = ridgeR.alpha_

# Fit new model with best alpha
best_ridge = Ridge(alpha = alpha)
best_ridge.fit(X_train, Y_train)

# Make predictions on validation set
Y_pred = best_ridge.predict(X_test)

# Compute RMSE
mean_squared_error(Y_test, Y_pred, squared = False)

16.82352523337167

## Lasso ##

Ridge regression does not give a noticeable improvement over multiple linear regression, so test Lasso.

In [18]:
# Create array of alphas
alphas = np.logspace(start = -5, stop = 5, num=100, endpoint = True, base = 10)

# Set folds
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)

# Build model
lasso = LassoCV(alphas = alphas, cv = cv, n_jobs = -1)

# Fit model
lasso.fit(X_train, Y_train)

# Get best alpha/lambda
alpha = lasso.alpha_

# Fit model with best alpha/lambda
best_lasso = Lasso(alpha = alpha)
best_lasso.fit(X_train, Y_train)

# Make predictions on validation set
Y_pred = best_lasso.predict(X_test)

# Compute RMSE
mean_squared_error(Y_test, Y_pred, squared = False)

17.806986921925713

## Generate critical_temp predictions using Lasso ##

Using Ridge with the best possible tuning parameter computed the lowest RMSE of the three models used, so that is used to make predictions on test data.

In [43]:
# Make predictions on test data
test_full = scaler.fit_transform(test_full)
yhat_ridge = best_ridge.predict(test_full).reshape(1,1000)[0]
yhat_lasso = best_lasso.predict(test_full)

# Summarize prediction
yhat_ridge = pd.Series(yhat)
print(yhat_ridge)

# Write predictions to csv
rows = np.arange(0,1000,1)
temp_predictions = pd.DataFrame(yhat_ridge,rows)
temp_predictions.to_csv('predictions.csv')

0       49.013794
1        7.025564
2      103.300094
3        1.751054
4        4.887319
          ...    
995     37.314312
996    133.647344
997     29.065645
998     33.431576
999      5.630801
Length: 1000, dtype: float64
