In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from io import StringIO

In [2]:
# Install prettytable library
!pip install prettytable

Collecting prettytable
  Downloading prettytable-3.10.0-py3-none-any.whl.metadata (30 kB)
Downloading prettytable-3.10.0-py3-none-any.whl (28 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.10.0


In [2]:
import boto3
# Location of the input dataset inside the project bucket
bucket_name = 'p4-haritha'
data_key = 'p4_haritha_wine_input_data/winequality-white.csv'

# S3 client bound to the eu-west-3 region; used below to download the CSV
s3_client = boto3.client(service_name='s3', region_name='eu-west-3')

In [3]:
# Download the CSV object from S3 and parse it into a DataFrame.
# The file is semicolon-delimited, hence the explicit delimiter.
response = s3_client.get_object(Key=data_key, Bucket=bucket_name)
raw_bytes = response['Body'].read()
data_content = raw_bytes.decode('utf-8')
df = pd.read_csv(StringIO(data_content), delimiter=';')

In [5]:
# Offline alternative to the S3 download above (reads a local copy of the file):
# df = pd.read_csv('winequality-white.csv', delimiter=";")

In [4]:
# Function to categorize wine quality
def categorize_quality(quality):
    """Map a numeric wine quality score to a quality category.

    Bins (contiguous over the whole 0-10 range, so fractional scores such
    as 4.5 or 7.5 are categorised instead of falling between bins):

    * ``0 <= quality < 5``   -> "Low Quality"
    * ``5 <= quality < 8``   -> "Average Quality"
    * ``8 <= quality <= 10`` -> "High Quality"

    Parameters
    ----------
    quality : int or float
        Quality score, expected in the range 0 to 10.

    Returns
    -------
    str or None
        The category name, or ``None`` when the score is outside 0-10
        (including NaN, for which the range comparison is False).
    """
    # Out-of-range and NaN scores have no category.
    if not 0 <= quality <= 10:
        return None
    if quality < 5:
        return "Low Quality"
    if quality < 8:
        return "Average Quality"
    return "High Quality"

In [5]:
# Assume all features are continuous except the target.
# Both the raw target ('quality') and its encoded copy ('quality_encoded')
# are excluded, so re-running this cell after 'quality_encoded' has been
# added cannot pull the target into the feature list.
continuous_features = df.columns.difference(['quality', 'quality_encoded']).tolist()

# Transform 'quality' into categorical bins.
# Guarded so the cell is idempotent: once the column holds category names
# (non-numeric), binning is skipped instead of failing on string input.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = df['quality'].apply(categorize_quality)

# Encode the category names as integer class labels for the classifiers
label_encoder = LabelEncoder()
df['quality_encoded'] = label_encoder.fit_transform(df['quality'])

# Preprocessing step: standardize every continuous feature column
pipeline = ColumnTransformer([
    ('scaler', StandardScaler(), continuous_features)  # Apply standardization
    # Add other transformers here if needed
])

In [6]:
# Split data into features and target.
# Drop both target representations from X: leaving 'quality_encoded' in the
# feature frame would leak the label to any step that does not select
# columns explicitly by name.
X = df.drop(columns=['quality', 'quality_encoded'])
y = df['quality_encoded']

In [7]:
# Split data into train (80%), then halve the remaining 20% into
# validate and test. Fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified; if the quality classes are
# imbalanced, consider passing stratify=y / stratify=y_temp — confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the pipeline on the training data only, then apply the fitted
# transformation to all three splits. Results are wrapped back into
# DataFrames that keep the original row index, so each transformed frame
# stays aligned with its y_* Series.
X_train_transformed = pd.DataFrame(
    pipeline.fit_transform(X_train), columns=continuous_features, index=X_train.index
)
X_validate_transformed = pd.DataFrame(
    pipeline.transform(X_validate), columns=continuous_features, index=X_validate.index
)
X_test_transformed = pd.DataFrame(
    pipeline.transform(X_test), columns=continuous_features, index=X_test.index
)

In [8]:

# Random Forest Classifier: build with a fixed seed and fit on the
# standardized training split (fit returns the fitted estimator itself)
rf_model = RandomForestClassifier(random_state=42).fit(X_train_transformed, y_train)

# Class predictions for the held-out test split
rf_predictions = rf_model.predict(X_test_transformed)


# Evaluate the models
def evaluate_model(predictions, y_test):
    """Score a set of predictions against the true labels.

    Note the argument order: predictions first, true labels second.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels.
    y_test : array-like
        True class labels for the same rows.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``; precision and recall are
        macro-averaged over the classes.
    """
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='macro'),
        recall_score(y_test, predictions, average='macro'),
    )




# Score the random forest on the test split and on the validation split
rf_accuracy, rf_precision, rf_recall = evaluate_model(rf_predictions, y_test)
validate_prediction_rf = rf_model.predict(X_validate_transformed)
v_rf_accuracy, v_rf_precision, v_rf_recall = evaluate_model(validate_prediction_rf, y_validate)

# Report each split: a heading followed by the three metrics, two decimals each
for heading, scores in (
    ("Random Forest Metrics for Test Data:", (rf_accuracy, rf_precision, rf_recall)),
    ("Random Forest Metrics for validate data:", (v_rf_accuracy, v_rf_precision, v_rf_recall)),
):
    print(heading)
    for template, score in zip(
        ("Accuracy: {:.2f}", "Precision: {:.2f}", "Recall: {:.2f}"), scores
    ):
        print(template.format(score))

Random Forest Metrics for Test Data:
Accuracy: 0.96
Precision: 0.83
Recall: 0.63
Random Forest Metrics for validate data:
Accuracy: 0.95
Precision: 0.93
Recall: 0.51


In [9]:
from sklearn.tree import DecisionTreeClassifier


# Decision Tree Classifier: fixed seed, fitted on the standardized training split
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_transformed, y_train)

# Class predictions for the test and validation splits
dt_pred_test = dt_classifier.predict(X_test_transformed)
val_pred_dt = dt_classifier.predict(X_validate_transformed)

# Evaluation metrics for Decision Tree.
# Uses the same evaluate_model helper as the random forest so both models
# are scored identically (accuracy, macro precision, macro recall). Values
# are rounded to two decimals because they are reused as-is in the summary table.
test_acc_dt, test_prec_dt, test_rec_dt = (
    round(metric, 2) for metric in evaluate_model(dt_pred_test, y_test)
)
val_acc_dt, val_prec_dt, val_rec_dt = (
    round(metric, 2) for metric in evaluate_model(val_pred_dt, y_validate)
)

print("Decision Tree Classifier Metrics for Test data:")
print("Accuracy: {:.2f}".format(test_acc_dt))
print("Precision: {:.2f}".format(test_prec_dt))
print("Recall: {:.2f}".format(test_rec_dt))

print("Decision Tree Classifier Metrics for validate data:")
print("Accuracy: {:.2f}".format(val_acc_dt))
print("Precision: {:.2f}".format(val_prec_dt))
print("Recall: {:.2f}".format(val_rec_dt))



Decision Tree Classifier Metrics for Test data:
Accuracy: 0.92
Precision: 0.58
Recall: 0.66
Decision Tree Classifier Metrics for validate data:
Accuracy: 0.90
Precision: 0.54
Recall: 0.60


In [10]:
# Mapping dictionary: encoded class label -> category name.
# Derived from the fitted LabelEncoder instead of being hard-coded, so it
# cannot drift from the encoding actually used for training (for example
# if a category is absent from the data or a category name changes).
quality_mapping = {code: str(label) for code, label in enumerate(label_encoder.classes_)}

# Apply mapping to the random forest's validation predictions
validate_prediction_mapped_rf = [quality_mapping[prediction] for prediction in validate_prediction_rf]

# Create a single-column DataFrame of predicted category names
df_prediction_rf = pd.DataFrame(validate_prediction_mapped_rf, columns=['Predicted Quality'])

# Save as CSV (row index omitted)
df_prediction_rf.to_csv('predicted_quality_rf.csv', index=False)

In [11]:
# Translate the decision tree's encoded validation predictions into
# category names via the shared quality_mapping lookup
validate_prediction_mapped_dt = list(map(quality_mapping.__getitem__, val_pred_dt))

# Single-column frame holding the predicted category names
df_prediction_dt = pd.DataFrame({'Predicted Quality': validate_prediction_mapped_dt})

# Write the predictions to disk without the row index
df_prediction_dt.to_csv('predicted_quality_dt.csv', index=False)

In [12]:
from prettytable import PrettyTable
import numpy as np

# Random forest summary: one row per metric, test vs. validation,
# with the raw scores rounded to two decimals for display
rf_table = PrettyTable()
rf_table.field_names = ["Performance Metric", "Testing Set", "Validation Set"]
for metric_name, test_score, validation_score in (
    ("Accuracy", rf_accuracy, v_rf_accuracy),
    ("Precision", rf_precision, v_rf_precision),
    ("Recall", rf_recall, v_rf_recall),
):
    rf_table.add_row([metric_name, np.round(test_score, 2), np.round(validation_score, 2)])

# Decision tree summary: these scores were already rounded when computed
dt_metrics_table = PrettyTable()
dt_metrics_table.field_names = ["Performance Metric", "Testing Set", "Validation Set"]
for metric_name, test_score, validation_score in (
    ("Accuracy", test_acc_dt, val_acc_dt),
    ("Precision", test_prec_dt, val_prec_dt),
    ("Recall", test_rec_dt, val_rec_dt),
):
    dt_metrics_table.add_row([metric_name, test_score, validation_score])

# Show both tables, each under its own heading
for heading, table in (
    ("Performance Metric for Random Forest Classifier:", rf_table),
    ("Performance Metric for Decision Tree Classifier:", dt_metrics_table),
):
    print(heading)
    print(table)

Performance Metric for Random Forest Classifier:
+--------------------+-------------+----------------+
| Performance Metric | Testing Set | Validation Set |
+--------------------+-------------+----------------+
|      Accuracy      |     0.96    |      0.95      |
|     Precision      |     0.83    |      0.93      |
|       Recall       |     0.63    |      0.51      |
+--------------------+-------------+----------------+
Performance Metric for Decision Tree Classifier:
+--------------------+-------------+----------------+
| Performance Metric | Testing Set | Validation Set |
+--------------------+-------------+----------------+
|      Accuracy      |     0.92    |      0.9       |
|     Precision      |     0.58    |      0.54      |
|       Recall       |     0.66    |      0.6       |
+--------------------+-------------+----------------+


In [13]:
# Assemble the summary text: a heading and rendered table per model
summary_sections = [
    "Performance Metrics for Random Forest Classifier:\n",
    str(rf_table),
    "\n\nPerformance Metrics for Decision Tree Classifier:\n",
    str(dt_metrics_table),
]

# Write the sections back-to-back (writelines adds no separators)
with open("p4_haritha_metrics_summary.txt", "w") as file:
    file.writelines(summary_sections)

In [14]:
import boto3
# S3 client (eu-west-3) used by the upload cells below
s3 = boto3.client(service_name='s3', region_name='eu-west-3')

In [15]:
# Destination bucket and object key for the metrics summary
s3_bucket_p4_haritha = "p4-haritha"
s3_folder_path = "p4_haritha_output/p4_haritha_metrics_summary.txt"

# Upload the local summary file written earlier, then confirm the destination
s3.upload_file(
    Filename='p4_haritha_metrics_summary.txt',
    Bucket=s3_bucket_p4_haritha,
    Key=s3_folder_path,
)
print("Metrics file uploaded successfully to the S3 Bucket with the specified file path:", s3_folder_path)

Metrics file uploaded successfully to the S3 Bucket with the specified file path: p4_haritha_output/p4_haritha_metrics_summary.txt


In [16]:
# Object key for the random forest predictions inside the output folder
s3_classifier_path = "p4_haritha_output/Random Forest Classifier Model/predicted_quality_rf.csv"

# Upload the local predictions CSV, then confirm the destination
s3.upload_file(
    Filename='predicted_quality_rf.csv',
    Bucket=s3_bucket_p4_haritha,
    Key=s3_classifier_path,
)
print("Prediction results file uploaded successfully to the S3 Bucket with file path:", s3_classifier_path)

Prediction results file uploaded successfully to the S3 Bucket with file path: p4_haritha_output/Random Forest Classifier Model/predicted_quality_rf.csv


In [17]:
# Object key for the decision tree predictions inside the output folder
s3_classifier_path = "p4_haritha_output/Decision Tree Classifier Model/predicted_quality_dt.csv"

# Upload the local predictions CSV, then confirm the destination
s3.upload_file(
    Filename='predicted_quality_dt.csv',
    Bucket=s3_bucket_p4_haritha,
    Key=s3_classifier_path,
)
print("Prediction results file uploaded successfully to the S3 Bucket with file path:", s3_classifier_path)

Prediction results file uploaded successfully to the S3 Bucket with file path: p4_haritha_output/Decision Tree Classifier Model/predicted_quality_dt.csv
