In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv('Assignment Data.csv', encoding='latin1')

# Data Cleaning
df['description'] = df['description'].fillna('')

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply text preprocessing to features
df['clean_description'] = df['description'].apply(preprocess_text)

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(df['variety'])
y = to_categorical(y_encoded)  # For multi-class classification

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(df['clean_description']).toarray()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Build ANN model
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0)),
model.add(Dense(len(le.classes_), activation='softmax'))  # Output layer


In [16]:

# Compile model
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)



In [17]:
# Train model
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    verbose=1
)


Epoch 1/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - accuracy: 0.1465 - loss: 4.0335 - val_accuracy: 0.3868 - val_loss: 2.5983
Epoch 2/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 20ms/step - accuracy: 0.3938 - loss: 2.5816 - val_accuracy: 0.4546 - val_loss: 2.2542
Epoch 3/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.4547 - loss: 2.2610 - val_accuracy: 0.4793 - val_loss: 2.1170
Epoch 4/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.4877 - loss: 2.0702 - val_accuracy: 0.5021 - val_loss: 2.0292
Epoch 5/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.5162 - loss: 1.9524 - val_accuracy: 0.5190 - val_loss: 1.9848
Epoch 6/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.5412 - loss: 1.8344 - val_accuracy: 0.5270 - val_loss: 1.9498
Epoch 7/20
[1m6

In [19]:

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5588 - loss: 1.9777
Test Loss: 2.0055, Test Accuracy: 0.5552


In [20]:

# Prediction function for new data
def predict_variety(new_description):
    cleaned_text = preprocess_text(new_description)
    vectorized = tfidf.transform([cleaned_text]).toarray()
    prediction = model.predict(vectorized)
    return le.inverse_transform([np.argmax(prediction)])[0]


In [32]:

# Example prediction
new_wine_description = input("For Example: A full-bodied red with dark berry flavors and hints of oak\n" + "\t    ")
predicted_variety = predict_variety(new_wine_description)
print(f"Predicted Wine Variety: {predicted_variety}")

For Example: A full-bodied red with dark berry flavors and hints of oak
	     A full-bodied red with dark berry flavors and hints of oak


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Predicted Wine Variety: Pinot Noir
