<a href="https://colab.research.google.com/github/istvanbaksa/Data-Science-1/blob/main/Hands_On_Data_Analysis_with_Pandas_%E2%80%93_Second_Edition_Chapter_9_Exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hands-On Data Analysis with Pandas – Second Edition Chapter 9 Exercises

In [None]:
'''
1. Build a clustering model to distinguish between red and white wine by their
chemical properties:
a) Combine the red and white wine datasets (data/winequality-red.csv
and data/winequality-white.csv, respectively) and add a column for the
kind of wine (red or white).
b) Perform some initial EDA.
c) Build and fit a pipeline that scales the data and then uses k-means clustering to
make two clusters. Be sure not to use the quality column.
d) Use the Fowlkes-Mallows Index (the fowlkes_mallows_score() function
is in sklearn.metrics) to evaluate how well k-means is able to make the
distinction between red and white wine.
e) Find the center of each cluster.
'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import fowlkes_mallows_score
from sklearn.neighbors import NearestCentroid

# Exercise 1 (manual scaling variant): cluster red vs. white wine with k-means
# on standardized chemical properties, score the clustering against the true
# kind, and report the cluster centres.

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# The white-wine file is parsed with ';' as the delimiter, the red-wine file with the pandas default
winew = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-white.csv', delimiter = ';')
winer = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-red.csv')

winew['kind'] = 'white'      #Adding column for kind
winer['kind'] = 'red'

frames = [winew, winer]
wine = pd.concat(frames)     #Combining datasets

wine['kind'] = wine['kind'].map({'white': 0, 'red': 1})      #Changing kind to numeric (white -> 0, red -> 1) with an explicit mapping

X = wine.drop(['kind', 'quality'], axis = 1)                  #Features: chemical properties only (quality is excluded as the exercise requires)
y = wine['kind']                                              #Ground truth, used only for evaluation

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)        #Splitting up data to training and test set

sc = StandardScaler()

X_train = sc.fit_transform(X_train)                           #Learning mean/std from the training set only, then scaling it
X_test = sc.transform(X_test)                                 #Scaling the test set with the training statistics (re-fitting here would leak test data)

kmeans = KMeans(n_clusters=2, random_state=0)                 #Introducing KMeans and defining its parameters

kmeans.fit(X_train)                                           #Training model (k-means is unsupervised, so no labels are passed)

kmeans_pred = kmeans.predict(X_test)                          #Assigning each test sample to its nearest cluster

# NOTE: cluster ids are arbitrary, so cluster 0 need not correspond to kind 0 (white).
# If the report below looks inverted, the clusters are simply labelled the other way round.
print(classification_report(y_test, kmeans_pred))             #Classification report

print(fowlkes_mallows_score(y_test, kmeans_pred))             #The Fowlkes-Mallows index (geometric mean of the pairwise precision and recall; unaffected by how clusters are numbered)

print(kmeans.cluster_centers_)                                #Centers of the two k-means clusters (in standardized units)

print(pd.DataFrame(sc.inverse_transform(kmeans.cluster_centers_), columns = X.columns))      #The same cluster centers converted back to the original units

clf = NearestCentroid()                                       #For comparison only: per-kind centroids computed from the true labels on the unscaled data
clf.fit(X, y)

print(clf.centroids_)                                         #Class centroids (not the k-means cluster centers)



In [None]:
'''
1. Build a clustering model to distinguish between red and white wine by their
chemical properties:
a) Combine the red and white wine datasets (data/winequality-red.csv
and data/winequality-white.csv, respectively) and add a column for the
kind of wine (red or white).
b) Perform some initial EDA.
c) Build and fit a pipeline that scales the data and then uses k-means clustering to
make two clusters. Be sure not to use the quality column.
d) Use the Fowlkes-Mallows Index (the fowlkes_mallows_score() function
is in sklearn.metrics) to evaluate how well k-means is able to make the
distinction between red and white wine.
e) Find the center of each cluster.
'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import fowlkes_mallows_score
from sklearn.neighbors import NearestCentroid

# Exercise 1 (pipeline variant): scale + k-means in a single Pipeline, score
# the clustering against the true kind, and report the cluster centres.

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# The white-wine file is parsed with ';' as the delimiter, the red-wine file with the pandas default
winew = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-white.csv', delimiter = ';')
winer = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-red.csv')

winew['kind'] = 'white'      #Adding column for kind
winer['kind'] = 'red'

frames = [winew, winer]
wine = pd.concat(frames)     #Combining datasets

wine['kind'] = wine['kind'].map({'white': 0, 'red': 1})      #Changing kind to numeric (white -> 0, red -> 1) with an explicit mapping

X = wine.drop(['kind', 'quality'], axis = 1)                  #Features: chemical properties only (quality is excluded as the exercise requires)
y = wine['kind']                                              #Ground truth, used only for evaluation

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)        #Splitting up data to training and test set

pipeline_kmeans = Pipeline([                                                        #Setting up pipeline
                            ('Scaling', StandardScaler()),                          #Scaling (StandardScaler)
                            ('kmeans', KMeans(n_clusters=2, random_state=0))        #Model: KMeans
                            ]).fit(X_train)                                         #Training model


kmeans_pred = pipeline_kmeans.predict(X_test)               #Assigning each test sample to its nearest cluster

# NOTE: cluster ids are arbitrary, so cluster 0 need not correspond to kind 0 (white).
# If the report below looks inverted, the clusters are simply labelled the other way round.
print(classification_report(y_test, kmeans_pred))           #Classification report

print(fowlkes_mallows_score(y_test, kmeans_pred))           #The Fowlkes-Mallows index (geometric mean of the pairwise precision and recall; unaffected by how clusters are numbered)

fitted_scaler = pipeline_kmeans.named_steps['Scaling']      #Fitted steps of the pipeline
fitted_kmeans = pipeline_kmeans.named_steps['kmeans']

print(fitted_kmeans.cluster_centers_)                       #Centers of the two k-means clusters (in standardized units)

print(pd.DataFrame(fitted_scaler.inverse_transform(fitted_kmeans.cluster_centers_), columns = X.columns))      #The same cluster centers converted back to the original units

clf = NearestCentroid()                                     #For comparison only: per-kind centroids computed from the true labels on the unscaled data
clf.fit(X, y)

print(clf.centroids_)                                       #Class centroids (not the k-means cluster centers)


In [None]:
'''
2. Predict star temperature:
a) Using the data/stars.csv file, perform some initial EDA and then build
a linear regression model of all the numeric columns to predict the temperature
of the star.
b) Train the model on 75% of the initial data.
c) Calculate the R2 and RMSE of the model.
d) Find the coefficients for each regressor and the intercept of the linear regression
equation.
e) Visualize the residuals using the plot_residuals() function from the
ml_utils.regression module.
'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from yellowbrick.regressor import ResidualsPlot

# Exercise 2: linear regression predicting star temperature, with
# R-squared / RMSE on the held-out 25% and two views of the residuals.

for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

stars = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/stars.csv')

# Remove the columns that are not used as regressors, then keep only complete rows
numeric = stars.drop(columns = ['spectraltype', 'name', 'magH']).dropna()

# Regressors are every remaining column except the target
X = numeric.drop(columns = 'temperature')
y = numeric['temperature']

# 75% of the rows for training, 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict once and reuse the result for RMSE and the residual density plot
test_predictions = lr.predict(X_test)

print('R-squared: ', lr.score(X_test, y_test))

print('RMSE: ', np.sqrt(mean_squared_error(y_test, test_predictions)))

# One (coefficient, feature name) pair per regressor
print('Coefficients of the linear model: ', list(zip(lr.coef_, X_train.columns)))

print('Y-intercept of the linear model: ', lr.intercept_)

# Residuals plot: the visualizer fits its own fresh LinearRegression on the
# training data and scores it on the test data
model = LinearRegression()
visualizer = ResidualsPlot(model, hist=False)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Density estimate of the test-set residuals of the model fitted above
resid = y_test - test_predictions
sns.kdeplot(resid)

In [None]:
'''
3. Classify planets that have shorter years than Earth:
a) Using the data/planets.csv file, build a logistic regression model with the
eccentricity, semimajoraxis, and mass columns as regressors. You will
need to make a new column to use for the y (year shorter than Earth).
b) Find the accuracy score.
c) Use the classification_report() function from scikit-learn to see
the precision, recall, and F1 score for each class.
d) With the plot_roc() function from the ml_utils.classification
module, plot the ROC curve.
e) Create a confusion matrix using the confusion_matrix_visual() function
from the ml_utils.classification module.
'''
#!pip install scikit-plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import scikitplot as skplt

# Exercise 3: logistic regression classifying planets whose year is shorter
# than Earth's, evaluated with accuracy, a classification report, a ROC curve
# and a confusion matrix.

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

planets = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/planets.csv')


planets['y'] = planets['period'] < 365.2422                                     # Creating year classification column (True when the period is below 365.2422; a missing period compares as False)

planets2 = planets[['eccentricity', 'semimajoraxis', 'mass', 'y']].dropna()     # Dropping rows with missing values

X = planets2.drop('y', axis = 1)     #Data set
y = planets2.y                       #Target set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify = y)        #Splitting up data to training and test set (stratified on the target)

logreg = LogisticRegression(random_state = 0).fit(X_train, y_train)             #Training model

y_pred = logreg.predict(X_test)                           # Predicting once and reusing the result for every metric below

display(accuracy_score(y_test, y_pred))                   # Accuracy score
print(classification_report(y_test, y_pred))              # Classification report

y_true = y_test                                           # Plotting ROC curve
y_probas = logreg.predict_proba(X_test)
skplt.metrics.plot_roc(y_true, y_probas)                  # plot_roc replaces the deprecated plot_roc_curve
plt.show()

print(confusion_matrix(y_test, y_pred))                   # Confusion matrix

In [None]:
'''
4. Multiclass classification of white wine quality:
a) Using the data/winequality-white.csv file, perform some initial EDA on
the white wine data. Be sure to look at how many wines had a given quality score.
b) Build a pipeline to standardize the data and fit a multiclass logistic regression
model. Pass multi_class='multinomial' and max_iter=1000 to the
LogisticRegression constructor.
c) Look at the classification report for your model.
d) Create a confusion matrix using the confusion_matrix_visual() function
from the ml_utils.classification module. This will work as is for
multiclass classification problems.
e) Extend the plot_roc() function to work for multiple class labels. To do so,
you will need to create a ROC curve for each class label (which are quality scores
here), where a true positive is correctly predicting that quality score and a false
positive is predicting any other quality score. Note that ml_utils has a function
for this, but try to build your own implementation.
f) Extend the plot_pr_curve() function to work for multiple class labels
by following a similar method to part e). However, give each class its own
subplot. Note that ml_utils has a function for this, but try to build your own
implementation.
'''
#!pip install scikit-plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import fowlkes_mallows_score
from sklearn.linear_model import LogisticRegression
import scikitplot as skplt


# Exercise 4: multiclass (multinomial) logistic regression on white wine
# quality, evaluated with a classification report, a confusion matrix and
# ROC curves.

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

wine = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-white.csv', delimiter = ';')

print('Wine quality counts:\n',wine.quality.value_counts())                 # Checking quality scores
print('Number of rows with missing values: ', wine.isnull().sum().sum())     # Checking missing values (this is the total count of missing cells)

X = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',     # Data set
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]

y = wine['quality']                                                                 # Target set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify = y)          # Splitting data up into train and test set (stratified on quality)

wine_pipeline = Pipeline([('Scaler', StandardScaler()),                                                              # Pipeline: Scaling
                          ('Model', LogisticRegression(multi_class='multinomial', max_iter=1000, random_state = 0))  #           Model: Logistic regression
                          ]).fit(X_train, y_train)                                                                   #           Training model with X_train

y_pred = wine_pipeline.predict(X_test)                                    # Predicting once and reusing the result for both reports

print(classification_report(y_test, y_pred))                              # Classification report

print(confusion_matrix(y_test, y_pred))                                   # Confusion matrix

y_true = y_test                                                           # Plotting ROC curve
y_probas = wine_pipeline.predict_proba(X_test)
skplt.metrics.plot_roc(y_true, y_probas, figsize = (12,12))               # plot_roc replaces the deprecated plot_roc_curve

plt.show()


In [None]:
'''
5. We have seen how easy the scikit-learn API is to navigate, making it a cinch
to change which algorithm we are using for our model. Rebuild the red wine quality
model that we created in this chapter using an SVM instead of logistic regression.
We haven't discussed this model, but you should still be able to use it in scikit-learn.
Check out the link in the Further reading section to learn more about the
algorithm. Some guidance for this exercise is as follows:
a) You will need to use the SVC (support vector classifier) class from scikit-learn,
which can be found at
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html.
b) Use C=5 as an argument to the SVC constructor.
c) Pass probability=True to the SVC constructor to be able to use the
predict_proba() method.
d) Build a pipeline first using the StandardScaler class and then the SVC class.
e) Be sure to look at the classification report, precision-recall curve, and confusion
matrix for the model.

'''

#!pip install scikit-plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import fowlkes_mallows_score
from sklearn.svm import SVC
import scikitplot as skplt

# Exercise 5: red wine quality classified with a scaled SVC (C=5),
# evaluated with a classification report, confusion matrix and
# precision-recall curve.

for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

wine = pd.read_csv('https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_09/data/winequality-red.csv')

# Bin the quality score into two classes: (0, 6] -> 0 and (6, 10] -> 1
wine['quality2'] = pd.cut(wine['quality'], bins=[0, 6, 10], labels=[0, 1])

# Features are every column except the raw and the binned quality
X = wine.drop(columns = ['quality', 'quality2'])
y = wine['quality2']

# Hold out 10% of the rows for testing, stratified on the binned target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0, stratify = y)

# Standardize, then fit the support vector classifier;
# probability=True is what makes predict_proba() available below
pipeline_steps = [
    ('Scaler', StandardScaler()),
    ('SVM', SVC(C = 5, random_state = 0, probability = True)),
]
wine_pipeline = Pipeline(pipeline_steps)
wine_pipeline.fit(X_train, y_train)

# Predict once; the text reports and the plot all use the same predictions
y_pred = wine_pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Confusion matrix plot
skplt.metrics.plot_confusion_matrix(y_test, y_pred)

plt.show()


# Precision-recall curve from the predicted class probabilities
y_probas = wine_pipeline.predict_proba(X_test)
skplt.metrics.plot_precision_recall(y_test, y_probas)

plt.show()