# Preparations and Data Wrangling

In [None]:
# load modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from time import sleep
from IPython.display import clear_output

In [None]:
# Human-readable names for the numeric wind force classes
wind_force_class_names = {
    0: "No wind",
    1: "Weak wind force",
    2: "Middle wind force",
    3: "Strong wind force",
}

In [None]:
# fun to crop img
def fn_crop_image(img_array, y_start, x_start, y_height=60, x_width=110):
    """Cut a rectangular window out of an image array.

    Args:
        img_array: Array indexed as ``[row, column, ...]``, e.g. an image
            loaded with OpenCV.
        y_start: Row index of the top edge of the window.
        x_start: Column index of the left edge of the window.
        y_height: Number of rows to keep. Defaults to 60 (the window was
            originally 80 rows tall).
        x_width: Number of columns to keep. Defaults to 110.

    Returns:
        ``img_array[y_start:y_start + y_height, x_start:x_start + x_width]``.
        Only the first two axes are sliced; any trailing axes (such as
        colour channels) are passed through untouched, so no channel is
        zeroed or removed here. Slicing clips at the array border, so a
        window that reaches past the edge yields a smaller result.
    """
    return img_array[y_start:y_start + y_height, x_start:x_start + x_width]

# TODO: add a function to resize images
# (only needed if further dimension reduction is desired)
    

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper module).
# NOTE(review): the data paths used later in this notebook are relative
# ("../data/...") — confirm they resolve to the mounted drive.
from google.colab import drive
drive.mount("/content/drive")  # mount google drive

In [None]:
# Directory that holds the original (uncropped) images
DATADIR = "../data/Originals/"

# Load the labels; the unnamed first CSV column is renamed to "img" because
# its values are used as image file names further down.
label_df = pd.read_csv("../data/labels_old_camera.csv")
label_df = label_df.rename(columns={"Unnamed: 0": "img"})

# Retain only images with a meaningful label (rows labelled "0" are dropped).
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
label_df = label_df[label_df["wind_force"] != "0"].copy()

# Array with the file names of all labeled images
labeled_img = label_df["img"].to_numpy()

# Numeric wind force variable: "n" -> 0, "w" -> 1, "m" -> 2, "s" -> 3.
# Any other (or missing) value falls back to -1.
wind_force_codes = {"n": 0, "w": 1, "m": 2, "s": 3}
label_df["wind_force_num"] = (
    label_df["wind_force"].map(wind_force_codes).fillna(-1).astype(int)
)


In [None]:
training_data = []


def create_training_data():
    """Fill ``training_data`` with ``[cropped_image, wind_label]`` pairs.

    For every file name in ``labeled_img`` the image is read from
    ``DATADIR``, converted from BGR to RGB, cropped with ``fn_crop_image``
    at row 250 / column 570 and paired with its ``wind_force_num`` entry
    from ``label_df``. Results are appended to the module-level list
    ``training_data``.

    Raises:
        FileNotFoundError: If an image cannot be read. ``cv2.imread``
            returns ``None`` instead of raising, which would otherwise
            surface as an obscure error inside ``cv2.cvtColor``.
        ValueError: If ``label_df`` does not hold exactly one row for an
            image name (raised when unwrapping the label with ``item()``).
    """
    for img in labeled_img:
        path = os.path.join(DATADIR, img)
        img_array = cv2.imread(path)
        if img_array is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)  # from BGR to RGB
        new_array = fn_crop_image(img_array, 250, 570)
        # Look up the label that belongs to this image ...
        wind_label = label_df.loc[label_df["img"] == img, "wind_force_num"]
        # ... and unwrap the one-element result into a plain Python scalar
        wind_label = wind_label.to_numpy().item()
        training_data.append([new_array, wind_label])


create_training_data()

In [None]:
# Layout of the list (for reference only):
# training_data[0]          --> list holding the image data and the label of the first image
# training_data[0][0]       --> numpy.ndarray with the pixels of the first image
# training_data[0][0].shape --> (60, 110, 3), the crop size set in fn_crop_image
# training_data[0][1]       --> int wind force label

first_image, _first_label = training_data[0]
print(f"type :{first_image.shape}")
print(len(training_data))

In [None]:
# Separate the [image, label] pairs into two parallel lists
images = [feature for feature, _ in training_data]
labels = [label for _, label in training_data]

# Convert to NumPy arrays. The list is converted exactly once: the former
# reshape only restated the array's own shape and rebuilt the array three
# extra times to read it.
# NOTE(review): assumes every crop has the same shape, so the result is a
# single (n_images, height, width, channels) array — confirm.
np_images = np.array(images)
np_labels = np.array(labels)


In [None]:
# Notes

# X_train=train_images[:5000,:].reshape(5000,-1)
# y_train=train_labels[:5000]
# X_test=test_images[:1000,:].reshape(1000,-1)
# y_test=test_labels[:1000]
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
# Show every image in labeled_img next to the red channel of its cropped window
sleep_seconds = 5
for image_name in labeled_img:
    # Build the path the same way create_training_data does
    image_path = os.path.join(DATADIR, image_name)
    img_array = cv2.imread(image_path)
    if img_array is None:
        # cv2.imread signals an unreadable file by returning None
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)  # from BGR to RGB
    # Crop window; originally: fn_crop_image(img_array, 230, 570)
    new_array = fn_crop_image(img_array, 250, 570)
    new_array = new_array[:, :, 0]  # index 0 is the red channel after the RGB conversion

    fig = plt.figure(1, figsize=(40, 15))
    chart0 = fig.add_subplot(121)
    chart1 = fig.add_subplot(122)
    print(image_name)
    chart0.imshow(img_array)
    chart1.imshow(new_array, cmap='gray')
    plt.show()
    # Leave the image on screen for a moment, then clear the cell output
    sleep(sleep_seconds)
    clear_output()

In [None]:
# Display the dimensions of the stacked image array
np_images.shape

# Try ML models

## Preparations

In [None]:
# Scikit-learn (formerly scikits.learn and also known as sklearn) is a free 
# software machine learning library for the Python programming language. 
# It features various classification, regression and clustering algorithms, 
# and is designed to interoperate with the Python numerical and scientific 
# libraries NumPy and SciPy. (from wiki)

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.inspection import permutation_importance

# common visualization module
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set()

# numeric library
import numpy as np

import os
import pandas as pd
from time import time as timer
import tarfile

import tensorflow as tf

%matplotlib inline
from matplotlib import animation
from IPython.display import HTML

In [None]:
# Reduce the images to the red channel and flatten each one into a feature row
print(f"np_images shape: {np_images.shape}")
np_images_red = np_images[:, :, :, 0]  # reduce to red channel
print(f"np_images_red shape: {np_images_red.shape}")

# One row per image. The row count is taken from the data instead of a
# hard-coded 268, so features and labels stay aligned when the number of
# labeled images changes.
np_images_red_flatten = np_images_red.reshape(len(np_images_red), -1)
print(f"np_images_red_flatten shape: {np_images_red_flatten.shape}")

# 1) Split into train and test set (20 % of the samples go to the test set)
x_train, x_test, y_train, y_test = train_test_split(np_images_red_flatten, np_labels, test_size=0.2)

In [None]:
np_images_red_flatten.shape

## Linear Model - Linear Regression

In [None]:
# 2. Fit the model: a linear regression of the labels in y_train on the
#    flattened image rows in x_train
reg = linear_model.LinearRegression()
reg.fit(x_train, y_train)

In [None]:
# 3. Evaluate MSE, MAE and R2 on the train and test datasets
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict once per split and reuse the result for every metric
reg_train_pred = reg.predict(x_train)
reg_test_pred = reg.predict(x_test)

# Mean squared error. This used to print np.std of the residuals, which is
# the residual standard deviation and not the MSE.
print(f"MSE train data: {mean_squared_error(y_train, reg_train_pred)}")
print(f"MSE test data: {mean_squared_error(y_test, reg_test_pred)}")
# Mean absolute error
print(np.mean(np.abs(y_test - reg_test_pred)))  # manual cross-check of the test MAE
print(f"MAE train data: {mean_absolute_error(y_train, reg_train_pred)}")
print(f"MAE test data: {mean_absolute_error(y_test, reg_test_pred)}")
# R2 (coefficient of determination)
print(f"R2 train data: {r2_score(y_train, reg_train_pred)}")
print(f"R2 test data: {r2_score(y_test, reg_test_pred)}")

## Linear Model - Logistic Regression

In [None]:
# 1. Create the classifier; the multi_class setting is kept in a variable
#    because it is also shown in the accuracy report below
multi_class = 'multinomial'
clf = linear_model.LogisticRegression(
    solver='sag',
    max_iter=100,
    multi_class=multi_class,
)

# 2. Fit the model
clf.fit(x_train, y_train)

# 3. Evaluate accuracy on the train and test datasets
train_accuracy = clf.score(x_train, y_train)
test_accuracy = clf.score(x_test, y_test)
print("training accuracy : %.3f (%s)" % (train_accuracy, multi_class))
print("Test accuracy : %.3f (%s)" % (test_accuracy, multi_class))

In [None]:
# Class predictions of the fitted classifier clf for every row in x_test
y_test_pred = clf.predict(x_test)

In [None]:
# Show one test image together with its true and predicted wind force class
idx = 29
plt.grid(False)
# Restore the 2-D image from its flattened row. The shape is taken from
# np_images_red rather than hard-coded as 60 x 110, so a changed crop size
# keeps working.
plt.imshow(x_test[idx].reshape(np_images_red.shape[1:]), cmap='gray')
c_true = y_test[idx]
c_pred = y_test_pred[idx]
print('Label:', wind_force_class_names[c_true])
print('Prediction:', wind_force_class_names[c_pred])
if c_true == c_pred:
    print('Hurray! :-)')