In [None]:
import random
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Step 1 - Modeling

In [None]:
# Read the train and test splits; both files are semicolon-delimited.
# NOTE(review): the file names suggest the features were scaled upstream — confirm.
df_train, df_test = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('scaled_train.csv', 'scaled_test.csv')
)

In [None]:
# Baseline model: ordinary least-squares linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split each frame into predictors (everything except 'prec') and the 'prec' target
train_features = df_train.drop('prec', axis=1)
train_target = df_train['prec']
test_features = df_test.drop('prec', axis=1)

# Instantiate and fit in one step (fit() returns the estimator itself)
regr = LinearRegression().fit(train_features, train_target)

# Predict the target for every row of the test set
y_pred = regr.predict(test_features)

In [None]:
# Report the fitted coefficients (one per predictor column)
print('Coefficients: \n', regr.coef_)

# Root mean squared error (RMSE) on the test set.
# The square root is taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it raises a TypeError on current releases.
print("Root mean squared error: %.2f"
      % np.sqrt(mean_squared_error(df_test['prec'], y_pred)))

# Coefficient of determination (R^2): 1 is a perfect prediction
print('Variance score: %.2f' % r2_score(df_test['prec'], y_pred))

# Overlay actual and predicted values on a single time-series plot
plt.figure(figsize=(20, 10))
plt.plot(df_test['prec'], label='Actual')
plt.plot(y_pred, label='Predicted')

# Restrict the x-axis to the first 365 rows
# NOTE(review): this is "the first year" only if the data has one row per day — confirm.
plt.xlim(0, 365)

plt.legend()
# Render the figure explicitly so the cell does not end on the Legend repr
plt.show()

# Step 2 - Feature importance

In [None]:
from sklearn.inspection import permutation_importance

# Predictors of the test set (every column except the 'prec' target)
test_features = df_test.drop('prec', axis=1)

# Permutation importance of the fitted model on the test set:
# 10 shuffles per feature, seeded for reproducibility
perm_importance = permutation_importance(
    regr,
    test_features,
    df_test['prec'],
    n_repeats=10,
    random_state=42,
)

# Index order that sorts the mean importances ascending
sorted_idx = perm_importance.importances_mean.argsort()
ranked_names = test_features.columns[sorted_idx]
ranked_scores = perm_importance.importances_mean[sorted_idx]

# Horizontal bar chart of mean importances in that order
plt.figure(figsize=(20, 10))
plt.barh(ranked_names, ranked_scores)
plt.xlabel("Permutation Importance")

# Label each bar with its feature name
plt.yticks(ticks=range(len(test_features.columns)), labels=ranked_names)
plt.show()


In [None]:
# Use the fitted regression coefficients as a second importance measure
importance = regr.coef_

# Predictor names and their positional slots on the x-axis
feature_labels = df_train.drop('prec', axis=1).columns
bar_positions = list(range(len(importance)))

# Print one line per coefficient, keyed by its position
for position, coefficient in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, coefficient))

# Bar chart of the coefficients, labelled with the matching column names
plt.figure(figsize=(20, 10))
plt.bar(bar_positions, importance)
plt.xticks(bar_positions, feature_labels, rotation='vertical')
plt.show()

# Export analysis data

In [None]:
# Assemble the export table in a single constructor call.
# Rows are ordered by ascending mean permutation importance (sorted_idx).
df = pd.DataFrame({
    # Feature names, reordered to match the sorted importances
    'feature_names': df_train.drop('prec', axis=1).columns[sorted_idx],
    # Mean permutation importance of each feature, ascending
    'feature_importance_values': perm_importance.importances_mean[sorted_idx],
    # Original column position of each feature; reuses sorted_idx instead of
    # recomputing argsort so all three columns share one ordering
    'original_order': sorted_idx,
    # Test-set RMSE of the model, repeated on every row. The square root is
    # taken explicitly because mean_squared_error's `squared=False` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    'rmse': np.sqrt(mean_squared_error(df_test['prec'], y_pred)),
})

df.head()

### Export to CSV file

In [None]:
# Write the feature-importance table as a semicolon-delimited CSV without the row index
output_csv = 'regression_model_feature_importance.csv'
df.to_csv(output_csv, sep=';', index=False)
