## Megatutorial 04: Regression

In [56]:
from pandas import read_csv

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error

## Daten laden

In [57]:
# Read the bike-sharing table; the first CSV column becomes the row index.
data = read_csv(
    filepath_or_buffer="../data/bikesharing.csv",
    index_col=0,
)

In [58]:
# Always the first step after loading: get an overview of the columns, their
# dtypes and their non-null counts.
# Model inputs must be numeric only, so non-numeric columns listed here still
# have to be encoded before modeling.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    object 
 1   holiday     731 non-null    object 
 2   weekday     731 non-null    object 
 3   workingday  731 non-null    object 
 4   weathersit  731 non-null    object 
 5   temp        731 non-null    float64
 6   atemp       731 non-null    float64
 7   hum         731 non-null    float64
 8   windspeed   731 non-null    float64
 9   casual      731 non-null    int64  
 10  registered  731 non-null    int64  
 11  cnt         731 non-null    int64  
 12  day         731 non-null    int64  
 13  month       731 non-null    int64  
 14  year        731 non-null    int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 91.4+ KB


# Data Preprocessing

## Imputing

In [59]:
# SimpleImputer is told via `strategy` how to fill gaps in the column; here
# missing "hum" values are replaced by the column median.
# NOTE(review): the data.info() output above already reports 731 non-null
# values for "hum", so this step appears to change nothing on this file - confirm.
imputer = SimpleImputer(strategy="median")

# Double square brackets select a one-column DataFrame: the imputer expects
# two-dimensional input.
hum_frame = data[["hum"]]
data["hum"] = imputer.fit(hum_frame).transform(hum_frame)

In [60]:
# Re-inspect the frame after imputing to confirm that every humidity ("hum")
# value is populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    object 
 1   holiday     731 non-null    object 
 2   weekday     731 non-null    object 
 3   workingday  731 non-null    object 
 4   weathersit  731 non-null    object 
 5   temp        731 non-null    float64
 6   atemp       731 non-null    float64
 7   hum         731 non-null    float64
 8   windspeed   731 non-null    float64
 9   casual      731 non-null    int64  
 10  registered  731 non-null    int64  
 11  cnt         731 non-null    int64  
 12  day         731 non-null    int64  
 13  month       731 non-null    int64  
 14  year        731 non-null    int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 91.4+ KB


## Encoding

In [61]:
# Label-encode every text column so that all model inputs are numeric.
# One LabelEncoder is fitted per column and kept, so the integer codes can be
# mapped back to the original labels later (inverse_transform).
# NOTE(review): the resulting integer codes are treated as ordered numbers by
# LinearRegression; consider one-hot encoding for nominal columns - confirm.
categorical_columns = ["season", "holiday", "weekday", "workingday", "weathersit"]

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Keep the per-column encoder names available for any cell that refers to them.
season_encoder = encoders["season"]
holiday_encoder = encoders["holiday"]
weekday_encoder = encoders["weekday"]
workingday_encoder = encoders["workingday"]
weathersit_encoder = encoders["weathersit"]


In [62]:
# Sanity check: after encoding, no non-numeric columns should be left.
non_numeric_columns = data.select_dtypes(exclude="number")
non_numeric_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Empty DataFrame


## Feature/Target Split

In [63]:
# Input columns handed to the models.
features = [
    "season",
    "holiday",
    "weekday",
    "weathersit",
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "month",
]

# Kept as a one-element list, so y is a single-column DataFrame, not a Series.
target = ["cnt"]

X, y = data[features], data[target]

## Train/Test Split

In [64]:
# Hold out 40 % of the rows for testing. The split ratio was not prescribed for
# the project work and may be chosen freely; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=875,
)

# Modeling

## Lineare Regression

In [67]:
# Fit ordinary least squares on the training split and predict the held-out rows.
linear_model = LinearRegression().fit(X_train, y_train)
predictions = linear_model.predict(X_test)

# Score the predictions against the true test targets.
r2 = r2_score(y_test, predictions)
rmse = root_mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("R2", r2, ", RMSE", rmse, ", MAE", mae)

R2 0.5391082444141173 , RMSE 1375.3441961742901 , MAE 1163.164204338155


## Decision Tree Regressor

In [69]:
# Fit a regression tree limited to depth 5 on the training split and score it
# on the test split.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be broken
# differently from run to run, so the printed metrics would not be reproducible.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=875)
tree_model.fit(X_train, y_train)

predictions = tree_model.predict(X_test)

print(
    "R2", r2_score(y_test, predictions),
    ", RMSE", root_mean_squared_error(y_test, predictions),
    ", MAE", mean_absolute_error(y_test, predictions)
)

R2 0.5578119411275271 , RMSE 1347.1483842208493 , MAE 1130.90640950681
