# Del 5: Strojno učenje: Linearna regresija

## The Linear Regression Model

### Instance Based Learning Vs. Model Based Learning

###  Introduction To The Data

In [None]:
import pandas as pd
data = pd.read_csv('data/AmesHousing.txt', delimiter="\t")

In [None]:
train = data[0:1460]
test = data[1460:]

In [None]:
#train.info()
target = 'SalePrice'

### Simple Linear Regression

In [None]:
import matplotlib.pyplot as plt
import seaborn

# Three vertically stacked scatter panels, one per candidate feature, each against SalePrice.
fig = plt.figure(figsize=(7, 15))
ax1, ax2, ax3 = (fig.add_subplot(3, 1, row) for row in (1, 2, 3))

for feature, axis in zip(("Garage Area", "Gr Liv Area", "Overall Cond"), (ax1, ax2, ax3)):
    train.plot.scatter(x=feature, y="SalePrice", ax=axis)

plt.show()

In [None]:
train[['Garage Area', 'Gr Liv Area', 'Overall Cond', 'SalePrice']].corr()

### Least Squares

**Residual Sum Of Squares**

<p><img src="https://s3.amazonaws.com/dq-content/235/rss.gif"></p>

### Using Scikit-Learn To Train And Predict

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

### Making Predictions

### Multiple Linear Regression

In [None]:
# LinearRegression is not imported by any earlier cell, so bring it into scope here.
from sklearn.linear_model import LinearRegression

# Unfitted estimator and the two predictor columns for the multiple-regression example.
lr = LinearRegression()
cols = ['Overall Cond', 'Gr Liv Area']




## Feature Selection

### Missing Values

In [None]:
import pandas as pd

data = pd.read_csv('data/AmesHousing.txt', delimiter="\t")
train = data[0:1460]
test = data[1460:]

In [None]:
numerical_train = train.select_dtypes(include=['int', 'float'])

In [None]:
numerical_train = numerical_train.drop(['PID', 'Year Built', 'Year Remod/Add', 'Garage Yr Blt', 'Mo Sold', 'Yr Sold'], axis=1)

In [None]:
null_series = numerical_train.isnull().sum()

In [None]:
full_cols_series = null_series[null_series == 0]

In [None]:
full_cols_series

### Correlating Feature Columns With Target Column

### Correlation Matrix Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Train And Test Model

In [None]:
# None of these names is imported by an earlier cell, so import them where they are first used.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# NOTE(review): `features` and `clean_test` are not defined in any cell above this one;
# they have to be created (selected feature columns / cleaned test rows) before this runs.
lr = LinearRegression()
lr.fit(train[features], train[target])

test_predictions = lr.predict(clean_test[features])

# scikit-learn's signature is mean_squared_error(y_true, y_pred).
test_mse = mean_squared_error(clean_test[target], test_predictions)

test_rmse = np.sqrt(test_mse)

print(test_rmse)

### Removing Low Variance Features

### Final Model

In [None]:
# Fit a fresh LinearRegression on train[features] against train[target].
# NOTE(review): `features` is whatever column list is bound when this cell runs; it is not defined in view.
lr = LinearRegression()
lr.fit(train[features], train[target])

In [None]:
# Predict on the cleaned test rows with the model fitted in the previous cell.
test_predictions = lr.predict(clean_test[features])

# Argument order follows scikit-learn's mean_squared_error(y_true, y_pred),
# matching the other scoring calls in this notebook.
test_mse = mean_squared_error(clean_test[target], test_predictions)

test_rmse_2 = np.sqrt(test_mse)

print(test_rmse_2)

## Overfitting

### Introduction

In [None]:
import pandas as pd
# The data file has no header row, so the column names are supplied explicitly.
columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]
# `delim_whitespace=True` is deprecated in recent pandas; sep=r"\s+" is the documented equivalent.
cars = pd.read_table("data/auto-mpg.data", sep=r"\s+", names=columns)

In [None]:
cars.head()

In [None]:
cars.dtypes

In [None]:
# Drop the rows whose horsepower is the literal '?', on an independent copy,
# then convert the remaining horsepower values to float.
filtered_cars = cars.loc[cars['horsepower'].ne('?')].copy()
filtered_cars['horsepower'] = filtered_cars['horsepower'].astype(float)

In [None]:
filtered_cars.info()

### Bias and Variance

### Bias-variance tradeoff

<p><img alt="Imgur" src="http://scott.fortmann-roe.com/docs/docs/BiasVariance/biasvariance.png"></p>


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def train_and_test(cols):
    """Fit a linear model of mpg on `cols` and score it on the rows it was fitted on.

    Parameters
    ----------
    cols : list of str
        Column labels of `filtered_cars` used as predictors.

    Returns
    -------
    tuple
        (mse, variance): mean squared error of the in-sample predictions
        against `filtered_cars["mpg"]`, and `np.var` of those predictions.
    """
    y = filtered_cars["mpg"]
    X = filtered_cars[cols]
    # fit() returns the estimator itself, so it can be chained.
    model = LinearRegression().fit(X, y)
    fitted = model.predict(X)
    return mean_squared_error(y, fitted), np.var(fitted)

In [None]:
cyl_mse, cyl_var = train_and_test(["cylinders"])
weight_mse, weight_var = train_and_test(["weight"])

In [None]:
cyl_mse, cyl_var

In [None]:
weight_mse, weight_var

### Multivariate models

In [None]:
one_mse, one_var = train_and_test(["cylinders"])
one_mse, one_var

In [None]:
two_mse, two_var = train_and_test(["cylinders", "displacement"])
two_mse, two_var

In [None]:
three_mse, three_var = train_and_test(["cylinders", "displacement", "horsepower"])
three_mse, three_var

In [None]:
four_mse, four_var = train_and_test(["cylinders", "displacement", "horsepower", "weight"])
four_mse, four_var

In [None]:
five_mse, five_var = train_and_test(["cylinders", "displacement", "horsepower", "weight", "acceleration"])
five_mse, five_var

In [None]:
six_mse, six_var = train_and_test(["cylinders", "displacement", "horsepower", "weight", "acceleration", "model year"])
six_mse, six_var

In [None]:
seven_mse, seven_var = train_and_test(["cylinders", "displacement", "horsepower", "weight", "acceleration","model year", "origin"])
seven_mse, seven_var

### Cross validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
def train_and_cross_val(cols):
    """Score a linear model of mpg on `cols` with 10-fold cross-validation.

    The folds come from KFold(n_splits=10, shuffle=True, random_state=3), so
    the split is the same on every call. A new model is fitted for each fold.

    Parameters
    ----------
    cols : list of str
        Column labels of `filtered_cars` used as predictors.

    Returns
    -------
    tuple
        (avg_mse, avg_var): the mean over folds of the held-out MSE, and the
        mean over folds of `np.var` of the held-out predictions.
    """
    X = filtered_cars[cols]
    y = filtered_cars["mpg"]
    folds = KFold(n_splits=10, shuffle=True, random_state=3)

    fold_scores = []
    for fit_rows, holdout_rows in folds.split(X):
        model = LinearRegression().fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_pred = model.predict(X.iloc[holdout_rows])
        fold_scores.append(
            (mean_squared_error(y.iloc[holdout_rows], holdout_pred), np.var(holdout_pred))
        )

    # Transpose the per-fold (mse, var) pairs into one sequence per metric.
    mse_per_fold, var_per_fold = zip(*fold_scores)
    return np.mean(mse_per_fold), np.mean(var_per_fold)

In [None]:
two_mse, two_var = train_and_cross_val(["cylinders", "displacement"])
two_mse, two_var

In [None]:
three_mse, three_var = train_and_cross_val(["cylinders", "displacement", "horsepower"])
three_mse, three_var

In [None]:
four_mse, four_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight"])
four_mse, four_var

In [None]:
five_mse, five_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration"])
five_mse, five_var

In [None]:
six_mse, six_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration", "model year"])
six_mse, six_var

In [None]:
seven_mse, seven_var = train_and_cross_val(["cylinders", "displacement", "horsepower", "weight", "acceleration","model year", "origin"])
seven_mse, seven_var

### Plotting cross-validation error vs. cross-validation variance

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Label both series and the axes so the figure can be read without the code.
plt.scatter([2,3,4,5,6,7], [two_mse, three_mse, four_mse, five_mse, six_mse, seven_mse],
            c='red', label='mean cross-validation MSE')
plt.scatter([2,3,4,5,6,7], [two_var, three_var, four_var, five_var, six_var, seven_var],
            c='blue', label='mean variance of predictions')
plt.xlabel('number of features in the model')
plt.ylabel('value')
plt.legend()
plt.show()

## Processing And Transforming Features


In [None]:
import pandas as pd

data = pd.read_csv('data/AmesHousing.txt', delimiter="\t")
train = data[0:1460]
test = data[1460:]

# Count the nulls in each training column, show the counts,
# and keep only the columns that have none.
train_null_counts = train.isna().sum()
print(train_null_counts)
df_no_mv = train.loc[:, train_null_counts == 0]

### Categorical Features

In [None]:
print(train['Utilities'].value_counts())

In [None]:
print(train['Street'].value_counts())

In [None]:
print(train['House Style'].value_counts())

In [None]:
text_cols = df_no_mv.select_dtypes(include=['object']).columns

# `train` is a row slice of `data`; take an explicit copy so the assignments
# below target an independent frame instead of a slice of another frame.
train = train.copy()

for col in text_cols:
    print(col+":", len(train[col].unique()))
    # Plain column assignment replaces the column, so the new dtype sticks.
    # `train.loc[:, col] = ...` writes into the existing object column in place
    # on pandas >= 2.0 and would leave the dtype as object.
    train[col] = train[col].astype('category')
    

### Dummy Coding

In [None]:
# Encode all text columns in one call. Passing the frame (rather than one Series
# at a time) makes get_dummies prefix every indicator with its source column
# name, so a level label that occurs in several columns cannot produce
# duplicate column labels. The source columns are replaced by their indicators.
col_dummies = pd.get_dummies(train[text_cols])
train = pd.concat([train.drop(columns=text_cols), col_dummies], axis=1)

In [None]:
train.head()

### Transforming Improper Numerical Features

In [None]:
print(train[['Year Remod/Add', 'Year Built']])

In [None]:
train['years_until_remod'] = train['Year Remod/Add'] - train['Year Built']

### Missing Values

In [None]:
import pandas as pd

data = pd.read_csv('data/AmesHousing.txt', delimiter="\t")
train = data[0:1460]
test = data[1460:]

# Null count per column of the training slice.
train_null_counts = train.isnull().sum()
# Keep the columns that have at least one null but fewer than 584 of them.
# NOTE(review): 584 is 40% of 1460 (the length of the train slice) — presumably a "<40% missing" cutoff; confirm.
df_missing_values = train[train_null_counts[(train_null_counts>0) & (train_null_counts<584)].index]

# Show the remaining null counts and the dtypes of the selected columns.
print(df_missing_values.isnull().sum())
print(df_missing_values.dtypes)

### Imputing Missing Values

In [None]:
float_cols = df_missing_values.select_dtypes(include=['float'])



## Vaja: Predicting House Sale Prices

###  Introduction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")

In [None]:
df.head()

In [None]:
#df.info(memory_usage='deep')

In [None]:
def transform_features(df):
    """Placeholder transform step: hand the input frame back unchanged (same object)."""
    unchanged = df
    return unchanged

In [None]:
def select_features(df):
    """Baseline feature selection: keep only 'Gr Liv Area' and 'SalePrice', in that order."""
    baseline_columns = ["Gr Liv Area", "SalePrice"]
    return df[baseline_columns]

In [None]:
def train_and_test(df):
    """Hold-out evaluation of a linear model that predicts 'SalePrice'.

    The first 1460 rows are the training set and the remaining rows the test
    set. Only integer and float columns are used; every such column except
    'SalePrice' is a predictor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'SalePrice' column and at least one other
        numeric column, with no missing values in the numeric columns.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test rows.
    """
    train = df[:1460]
    test = df[1460:]

    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])

    # Every numeric column except the target is a predictor.
    features = numeric_train.columns.drop("SalePrice")

    lr = LinearRegression()
    lr.fit(numeric_train[features], numeric_train["SalePrice"])
    predictions = lr.predict(numeric_test[features])

    rmse = np.sqrt(mean_squared_error(numeric_test["SalePrice"], predictions))
    return rmse

In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

### Feature Engineering

In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")

In [None]:
def transform_features(df):
    """Clean the raw housing frame and derive two age features.

    Steps, applied to a copy so the caller's frame is not modified:
      1. drop every column in which at least 5% of the values are missing;
      2. drop every remaining object (text) column that has any missing value;
      3. fill the missing values of the remaining numeric columns with that
         column's most frequent value;
      4. add 'Years Before Sale' (Yr Sold - Year Built) and
         'Years Since Remod' (Yr Sold - Year Remod/Add);
      5. drop the identifier and sale-related columns listed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns dropped in step 5.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    ValueError
        If any missing value is left after steps 1-3.
    """
    # Copy the data into a new dataframe.
    df_transformed = df.copy()

    # Drop all columns that have 5%+ missing values.
    columns_before = df_transformed.shape[1]
    missing_share = df_transformed.isnull().mean()
    df_transformed = df_transformed.drop(columns=missing_share[missing_share >= 0.05].index)
    print(f'Dropping {columns_before - df_transformed.shape[1]} columns with 5%+ missing values.')

    # Drop all text columns that contain missing values.
    columns_before = df_transformed.shape[1]
    text_missing = df_transformed.select_dtypes(include=['object']).isnull().sum()
    df_transformed = df_transformed.drop(columns=text_missing[text_missing > 0].index)
    print(f'Dropping {columns_before - df_transformed.shape[1]} object columns with any missing values.')

    # For all numeric columns, replace missing values with the most frequent value.
    numeric_missing = df_transformed.select_dtypes(include=['integer', 'float']).isnull().sum()
    fixable_cols = numeric_missing[numeric_missing > 0].index
    # mode() ignores NaN and returns its results sorted, so ties resolve to the smallest value.
    replacement_values_dict = {col: df_transformed[col].mode().iloc[0] for col in fixable_cols}
    df_transformed = df_transformed.fillna(replacement_values_dict)
    print(f'Filling {len(replacement_values_dict)} columns missing vlues with mode value. ')

    # Check that no missing values are left.
    if df_transformed.isnull().sum().sum() == 0:
        print('All missing values removed.')
    else:
        raise ValueError('Dataframe is containing missing values.')

    # Rework some columns into more useful information: the raw years are
    # dropped below, so keep them as ages relative to the year of sale.
    df_transformed['Years Before Sale'] = df_transformed['Yr Sold'] - df_transformed['Year Built']
    df_transformed['Years Since Remod'] = df_transformed['Yr Sold'] - df_transformed['Year Remod/Add']

    # Drop the remaining columns that are not needed.
    df_transformed = df_transformed.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Yr Sold", "Year Built", "Year Remod/Add"], axis=1)

    print(f'After transform_features ---> {df_transformed.shape[1]} columns in DF.')
    return df_transformed

In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

### Feature Selection

In [None]:
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

In [None]:
def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    """Select model features by correlation and encode the categorical ones.

    Steps (the caller's frame is not modified):
      1. drop numeric columns whose absolute correlation with 'SalePrice' is
         below `coeff_threshold` ('SalePrice' itself is always kept);
      2. drop nominal columns with more than `uniq_threshold` distinct values;
      3. convert the remaining object (text) columns to category and replace
         them with indicator columns named '<column>_<level>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'SalePrice' column.
    coeff_threshold : float, default 0.4
        Minimum absolute correlation with 'SalePrice' for a numeric column to be kept.
    uniq_threshold : int, default 10
        Maximum number of distinct values for a nominal column to be kept.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with text columns replaced by integer indicators.
    """
    # Drop the numeric columns whose correlation with the target is below the threshold.
    columns_before = df.shape[1]
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    abs_corr = numeric_df.corr()['SalePrice'].abs()
    # The target must survive whatever threshold is passed.
    weak_cols = abs_corr[abs_corr < coeff_threshold].index.drop('SalePrice', errors='ignore')
    df = df.drop(columns=weak_cols)
    print(f'Dropping {columns_before - df.shape[1]} numeric columns with correlation less then {coeff_threshold}. Current: {df.shape[1]}')

    # Columns that are suitable for conversion to categories.
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

    # Keep the nominal columns that are still present in df.
    transform_cat_cols = [col for col in nominal_features if col in df.columns]

    # Count the distinct values of each of those columns.
    uniqueness_counts = df[transform_cat_cols].nunique()

    columns_before = df.shape[1]
    df = df.drop(columns=uniqueness_counts[uniqueness_counts > uniq_threshold].index)
    print(f'Dropping {columns_before - df.shape[1]} object columns with more then {uniq_threshold} unique values. Current: {df.shape[1]}')

    # Convert the text columns to category, then replace them with indicator columns.
    text_cols = df.select_dtypes(include=['object']).columns
    if len(text_cols) > 0:
        # Integer indicators, so a later integer/float dtype filter keeps them
        # (newer pandas would otherwise emit bool indicators).
        dummies = pd.get_dummies(df[text_cols].astype('category'), dtype='uint8')
        df = pd.concat([df.drop(columns=text_cols), dummies], axis=1)

    print(f'After select_features ---> {df.shape[1]} columns with types: {df.dtypes.value_counts().to_dict()}')
    return df


In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df, coeff_threshold=0.4, uniq_threshold=10)
rmse = train_and_test(filtered_df)

rmse

### Train And Test

In [None]:
def train_and_test(df, k=0, random_state=None):
    """Evaluate a linear model that predicts 'SalePrice' and return its RMSE.

    Only integer and float columns are used; every such column except
    'SalePrice' is a predictor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'SalePrice' column, with no missing values in
        the numeric columns.
    k : int, default 0
        Validation scheme:
        * 0 - hold-out: first 1460 rows train, remaining rows test;
        * 1 - shuffle the rows, split at row 1460, train on each part and test
          on the other, and average the two RMSE values;
        * otherwise - shuffled k-fold cross-validation with `k` folds,
          averaging the per-fold RMSE values.
    random_state : int or None, default None
        Seed for the shuffling when k is 1 or greater; None leaves the result
        run-dependent, as before.

    Returns
    -------
    float
        RMSE (k == 0) or mean RMSE over the splits.
    """
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    features = numeric_df.columns.drop("SalePrice")
    # The notebook imports the class directly; a `linear_model` module name is never imported.
    lr = LinearRegression()

    def _holdout_rmse(train, test):
        # Fit on `train`, score on `test`.
        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        return np.sqrt(mean_squared_error(test["SalePrice"], predictions))

    if k == 0:
        return _holdout_rmse(df[:1460], df[1460:])

    if k == 1:
        # Split the shuffled frame; splitting `df` would ignore the shuffle.
        shuffled_df = df.sample(frac=1, random_state=random_state)
        train = shuffled_df[:1460]
        test = shuffled_df[1460:]
        return np.mean([_holdout_rmse(train, test), _holdout_rmse(test, train)])

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_values = [
        _holdout_rmse(df.iloc[train_index], df.iloc[test_index])
        for train_index, test_index in kf.split(df)
    ]
    return np.mean(rmse_values)

In [None]:
df = pd.read_csv("data/AmesHousing.txt", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df, coeff_threshold=0.4, uniq_threshold=10)
rmse = train_and_test(filtered_df, k=4)

rmse

### Finding best result

In [None]:
result_dict = {}



In [None]:
print(result_dict)

In [None]:
# Key whose value in result_dict is smallest.
# NOTE(review): no cell in view fills result_dict after it is created empty, and
# min() raises ValueError on an empty dict — populate it before running this cell.
min(result_dict, key=result_dict.get)