# Implementations, Applications, and Tools

In [None]:
% pip install numpy # A library with functions to operate on n- dimensional arrays and matrices.
% pip install scikit-learn #  A highly advanced module for machine learning. It contains a good number of algorithms for classification, regressio, clustering etc.
% pip install matplotlib # This a plotting library that makes use of NumPy to graph a good variety of charts
% pip install pandas # The library that deals with data manipulation
% pip install textblob # read data from cloud, from blobs

# Data preprocessing

## Why process raw data?

Raw data = JSON files, text files, PDFs, Word documents, tables, etc.
1. structured data, unstructured data, semi-structured?
2. feature selection, evaluate the features
3. historical data, enough features
### Missing Data

- remove the rows that contain missing values
- fill the missing values in each column with that column's mean, median, maximum, or minimum


In [None]:
import numpy as np
import pandas as pd

# Small example frame with missing values: the second row has NaN in its
# first two columns.
# np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
data = pd.DataFrame([
    [4., 45., 984.],
    [np.nan, np.nan, 5.],
    [94., 23., 55.],
])
data  # last expression of the cell, so the notebook displays the DataFrame

In [None]:
# Replace every missing value with the constant 2.5 and print the result.
filled_with_constant = data.fillna(value=2.5)
print(filled_with_constant)

In [None]:
# Replace each missing value with the mean of its own column.
column_means = data.mean()
print(data.fillna(column_means))

In [None]:
# Replace each missing value with the maximum of its own column.
column_maxima = data.max()
print(data.fillna(column_maxima))

In [None]:
# Replace each missing value with the minimum of its own column.
column_minima = data.min()
print(data.fillna(column_minima))

# Feature scaling 



In [None]:
# Three samples (rows) with three features (columns) whose values lie on
# very different scales.
feature_rows = [
    [58., 1., 43.],
    [10., 200., 65.],
    [20., 75., 7.],
]
data = pd.DataFrame(feature_rows)
data  # last expression of the cell, so the notebook displays the DataFrame

# Min-Max scaler

```python
stand_scalar = preprocessing.StandardScaler().fit(data)
results = stand_scalar.transform(data)
print(results)
```

In [None]:
# Install scikit-learn into the kernel's environment before it is imported.

%pip install scikit-learn



In [None]:
from sklearn import preprocessing

# Three samples (rows) with three features (columns).
data = pd.DataFrame([
    [58., 1., 43.],
    [10., 200., 65.],
    [20., 75., 7.],
])

# Min-max scaling: learn each column's minimum and maximum, then rescale the
# column linearly into the range [0, 1].
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(data)
results = min_max_scaler.transform(data)
print(results)

# Standard scaler
## gaussian distribution

In [None]:
from sklearn import preprocessing

# Three samples (rows) with three features (columns).
data = pd.DataFrame([
    [58., 1., 43.],
    [10., 200., 65.],
    [20., 75., 7.],
])

# Standardization: learn each column's mean and standard deviation, then
# transform the column to zero mean and unit variance.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(data)
results = standard_scaler.transform(data)
print(results)

# Binarizing data



In [None]:
from sklearn import preprocessing
import pandas as pd

# Three samples (rows) with three integer features (columns).
data = pd.DataFrame([
    [58, 1, 43],
    [10, 200, 65],
    [20, 75, 7],
])

# Binarization: values greater than the threshold become 1, all others 0.
binarizer = preprocessing.Binarizer(threshold=50)
binarizer.fit(data)
results = binarizer.transform(data)
print(results)

# Machine Learning 

## three categories 
1. Supervised learning: an algorithm is fed a set of inputs and their corresponding outputs. The algorithm then has to figure out what the output will be for an unfamiliar input.
2. Unsupervised learning: the algorithm is given only inputs, without the corresponding outputs, so it has to discover structure in the data without a known relationship between input and output variables.
3. Reinforcement learning: the computer dynamically interacts with its environment in such a way as to improve its performance.

In [5]:
### Supervised learning for Classification

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data    # feature matrix: one row per flower, one column per length/width measurement
y = iris.target  # target vector: the class label of each flower
# print(X)  # uncomment to inspect the features
# print(y)  # uncomment to inspect the labels

# Split the data: hold out 30% of the samples for testing. The fixed seed
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier. A random forest is itself randomized, so it gets a
# fixed seed as well; without it the fitted model (and possibly the reported
# accuracy) can change from run to run.
rf = RandomForestClassifier(random_state=42)  # create the model
rf.fit(X_train, y_train)                      # fit it on the training set

# Evaluation: measure the accuracy of the classifier on the unseen test set.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")




Accuracy: 1.00


# For any classifier we need an accuracy of at least 70% (0.7)

In [7]:
### Supervised learning for Regression

# historical data

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

diabetes = load_diabetes()
X = diabetes.data    # feature matrix: all the variables the model uses to estimate the output
y = diabetes.target  # target vector: the numeric value to predict for each row
# print(X)  # uncomment to inspect the features
# print(y)  # uncomment to inspect the targets

# Split the data: hold out 30% of the samples for testing. The fixed seed
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the regressor. Fit and predict with `lreg`, the LinearRegression
# created here, not with a model left over from another cell.
lreg = LinearRegression()   # create the model
lreg.fit(X_train, y_train)  # fit it on the training set

# Evaluation: measure the error of the regressor on the unseen test set.
y_pred = lreg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)  # mean squared error
r2 = r2_score(y_test, y_pred)             # R2 score (coefficient of determination)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")




[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990749
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06833155
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286131
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04688253
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452873
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00422151
   0.00306441]]
[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 

# to prepare for the next class
1. Find the formulas for the mean squared error (MSE), the R2 score, and accuracy.
2. Check the formulas and explain how they work in our models.