In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Data Preprocessing
# Load the cereal table, fill gaps in 'potass', integer-encode the two
# categorical columns, standardize the numeric columns and save the result.
df = pd.read_csv('cereal.csv')

# Mean-impute missing potassium values. SimpleImputer fills NaN by default.
# NOTE(review): if this CSV marks missing values with a sentinel such as -1
# instead of NaN, nothing is imputed here — confirm against the raw file.
imputer = SimpleImputer(strategy='mean')
df[['potass']] = imputer.fit_transform(df[['potass']])

# Encode each categorical column with its OWN LabelEncoder. Re-fitting a single
# shared encoder overwrites its classes_, so the 'mfr' mapping would be lost and
# could no longer be inverse-transformed.
label_encoders = {}
for column in ['mfr', 'type']:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'type', exactly as before.

# Standardize the numeric columns to zero mean / unit variance.
# NOTE(review): 'rating' is part of this list and is used as the prediction
# target further down; any model predicting it must exclude it from its inputs.
scaler = StandardScaler()
numerical_features = ['calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'weight', 'cups', 'rating']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df.to_csv('cereal_modified.csv', index=False)

# Keep the preview as the cell's last expression so the notebook renders it.
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Data Exploration and Visualization

# Histogram of the 'calories' column with a KDE curve overlaid.
calories_fig, calories_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['calories'], kde=True, ax=calories_ax)
calories_ax.set_title('Distribution of Calories in Cereal')

# Pairwise plot grid over df; seaborn creates its own figure for this.
sns.pairplot(df)

# Boxplots of four nutritional columns drawn on a shared axis.
nutrient_columns = ['calories', 'protein', 'fat', 'sodium']
nutrient_fig, nutrient_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[nutrient_columns], ax=nutrient_ax)
nutrient_ax.set_title('Boxplot of Nutritional Features')
plt.show()

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

# Feature Selection and Dimensionality Reduction
# 'rating' is the quantity being predicted, so it must not also appear among the
# predictors: with it included, Lasso/RFE trivially pick the target itself and
# the selection says nothing about the nutritional features.
target_column = 'rating'
predictor_columns = [col for col in numerical_features if col != target_column]
X_features = df[predictor_columns]
y_rating = df[target_column]

# Lasso: features whose coefficient is not shrunk to exactly zero are "selected".
lasso = Lasso(alpha=0.01)
lasso.fit(X_features, y_rating)
lasso_selected_features = X_features.columns[(lasso.coef_ != 0)]
print("Lasso selected features:", lasso_selected_features)

# Recursive feature elimination down to 5 predictors, using Lasso as the estimator.
rfe = RFE(estimator=lasso, n_features_to_select=5)
rfe.fit(X_features, y_rating)
rfe_selected_features = X_features.columns[rfe.support_]
print("RFE selected features:", rfe_selected_features)

# PCA on the predictors only, so the components carry no target information.
pca = PCA(n_components=5)
pca_components = pca.fit_transform(X_features)
print("Explained variance ratio by PCA components:", pca.explained_variance_ratio_)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Classification Techniques
# Binary target: is a cereal's rating above the mean rating?
# The label is computed from 'rating', so 'rating' is removed from the feature
# matrix; leaving it in lets every model read the answer directly (target leakage).
# errors='ignore' keeps this line valid if 'rating' is not in numerical_features.
X = df[numerical_features].drop(columns=['rating'], errors='ignore')
y = df['rating'] > df['rating'].mean()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline linear classifier.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Decision tree; seeded so repeated runs give the same tree, matching the other
# seeded models in this notebook.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)
y_pred_dec_tree = dec_tree.predict(X_test)


def print_classifier_report(title, y_true, y_pred):
    """Print accuracy, precision, recall and the full classification report.

    Parameters:
        title: heading line printed before the metrics.
        y_true: ground-truth labels.
        y_pred: labels predicted by the classifier.
    """
    print(title)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


print_classifier_report("Logistic Regression Performance:", y_test, y_pred_log_reg)
print_classifier_report("\nDecision Tree Performance:", y_test, y_pred_dec_tree)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt

# Advanced Classification Methods
# Fit a random forest and a linear-kernel SVM on the existing train split,
# then compare all four classifiers on the held-out test split.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

svm_clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

# Display name -> test-set predictions, in the order the bars are drawn.
predictions_by_model = {
    'Logistic Regression': y_pred_log_reg,
    'Decision Tree': y_pred_dec_tree,
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
}
models = list(predictions_by_model)
accuracies = [accuracy_score(y_test, preds) for preds in predictions_by_model.values()]

# Bar chart of test accuracy per classifier.
accuracy_fig, accuracy_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, ax=accuracy_ax)
accuracy_ax.set_title('Comparison of Classifier Accuracies')
accuracy_ax.set_ylabel('Accuracy')
plt.show()

# Per-class precision/recall/F1 for the two models fitted in this cell.
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))

print("\nSVM Performance:")
print(classification_report(y_test, y_pred_svm))

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Testing and Model Evaluation
# 5-fold shuffled cross-validation (fixed seed) shared by every model below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def cv_accuracy_scores(estimator):
    """Return the per-fold accuracy of `estimator` on (X, y) using the shared `kf` splitter."""
    return cross_val_score(estimator, X, y, cv=kf, scoring='accuracy')


log_reg_cv_scores = cv_accuracy_scores(log_reg)
print(f"Logistic Regression Cross-Validation Accuracy: {log_reg_cv_scores.mean():.2f}")

dec_tree_cv_scores = cv_accuracy_scores(dec_tree)
print(f"Decision Tree Cross-Validation Accuracy: {dec_tree_cv_scores.mean():.2f}")

rf_cv_scores = cv_accuracy_scores(rf_clf)
print(f"Random Forest Cross-Validation Accuracy: {rf_cv_scores.mean():.2f}")

svm_cv_scores = cv_accuracy_scores(svm_clf)
print(f"SVM Cross-Validation Accuracy: {svm_cv_scores.mean():.2f}")

# Hold-out metrics for the random forest, computed from its test-set predictions.
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf)
    for metric in (precision_score, recall_score, f1_score)
)

print("\nRandom Forest Evaluation Metrics:")
print(f"Confusion Matrix:\n{conf_matrix_rf}")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"F1-Score: {f1_rf:.2f}")