In [None]:
pip uninstall cmake


In [None]:
!pip install tensorflow_datasets
!pip install datasets

In [None]:
import tensorflow_datasets as tfds

# Ask TFDS for the names of all dataset builders it knows about and print them.
# The list is kept in `datasets` because the next cell filters it by keyword.
datasets = tfds.list_builders()
print("Available Datasets:", datasets)

In [None]:
# Narrow the builder list down to the names that contain the search term
keyword = "imdb_reviews"
filtered_datasets = list(filter(lambda name: keyword in name, datasets))
print("Filtered Datasets:", filtered_datasets)


In [None]:
# Load the IMDB dataset.
# with_info=True makes tfds.load return a (data, info) pair; as_supervised=True
# yields every example as a (text, label) tuple, which is how the
# DataFrame-building cell further down unpacks it.
data, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

# Print the dataset metadata object returned alongside the data
print(info)


In [None]:
# Report how many examples the dataset metadata lists for the train and test splits
print("Number of training examples:", info.splits['train'].num_examples)
print("Number of testing examples:", info.splits['test'].num_examples)


In [None]:
import tensorflow_datasets as tfds

# Pull the two splits out of the mapping returned by tfds.load:
# 'train' becomes train_data, 'test' becomes test_data
train_data, test_data = data['train'], data['test']

# Print the dataset metadata once more for reference
print(info)


In [None]:
import pandas as pd

# Materialise the training split as a pandas DataFrame, one row per review.
# Each element of train_data is unpacked as a (text, label) pair; the text is
# decoded from UTF-8 bytes to str and the label is stored as a plain int.
train_examples = [
    {'Text': text.numpy().decode('utf-8'), 'Sentiment': int(label.numpy())}
    for text, label in train_data
]

# Build the frame once from the collected records
df = pd.DataFrame(train_examples)




# Data exploration

In [None]:
# Preview the first rows of the raw training frame (head() defaults to 5 rows)
print("First 5 rows:")
print(df.head())


In [None]:
# (rows, columns) of the raw training frame
print("Shape of the DataFrame:", df.shape)


In [None]:
# Column labels of the raw training frame
print("Columns in the DataFrame:", df.columns)


In [None]:
# Count null entries in each column
print("Missing values per column:")
print(df.isnull().sum())


In [None]:
# Number of rows that repeat an earlier row exactly (first occurrences are not counted)
print("Duplicated entries:", df.duplicated().sum())


In [None]:
# List the rows flagged as repeats of an earlier row
duplicate_mask = df.duplicated()
print("Duplicated rows:\n", df[duplicate_mask])


In [None]:
# Remove duplicated rows.
# The explicit .copy() makes df_cleaned an independent frame: later cells assign
# columns on it (cleaned text, review length, encoded labels), and doing that on
# a frame derived from another one can trigger pandas' SettingWithCopyWarning.
df_cleaned = df.drop_duplicates().copy()

# Check the shape after removing duplicates
print("Shape after removing duplicates:", df_cleaned.shape)


In [None]:
# Sanity check: the de-duplicated frame should report zero repeated rows
print("Duplicated rows after removing:", df_cleaned.duplicated().sum())


In [None]:
# Display the summary produced by DataFrame.info() for the de-duplicated frame.
# info() writes its report itself, so it is called bare rather than wrapped in print().
print("Dataset info:")

df_cleaned.info()

# Data cleaning

In [None]:
import re

# Function to clean text (remove URLs, mentions, special characters, etc.)
def clean_text(text):
    """Normalise a raw review into lowercase text made of letters and whitespace.

    Steps, in order:
      1. HTML tags are replaced by a single space.
      2. URLs (anything starting with "http" up to the next whitespace) are removed.
      3. @mentions are removed.
      4. Every character that is not an ASCII letter or whitespace is removed.
      5. The result is lowercased.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Runs of whitespace are left as they are.
    """
    # Strip HTML markup first (e.g. "<br />" line breaks in scraped review text).
    # If the tag were left in, the character filter below would keep its letters
    # ("br") as a bogus word and glue the words on either side of it together.
    # Requiring a letter after "<" avoids eating plain "<" / ">" comparisons.
    text = re.sub(r"</?[a-zA-Z][^>]*>", " ", text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove mentions (@username)
    text = re.sub(r"@\w+", "", text)
    # Remove everything except ASCII letters and whitespace (digits, punctuation, ...)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Lowercase so that later counting/vectorising is case-insensitive
    return text.lower()

# Run every review through clean_text and write the result back to 'Text'
df_cleaned['Text'] = df_cleaned['Text'].map(clean_text)

# Preview the cleaned rows
print(df_cleaned.head())


In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import matplotlib.pyplot as plt  # Import matplotlib.pyplot
import seaborn as sns

# Download the tokenizer data used by word_tokenize.
# Older NLTK releases read the 'punkt' resource, recent ones read 'punkt_tab'
# and raise LookupError if only 'punkt' is present, so request both. A resource
# name the installed NLTK does not know is reported by nltk.download, not raised.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Tokenize the whole cleaned corpus as one string
all_words = ' '.join(df_cleaned['Text'])
tokens = word_tokenize(all_words)

# Count the frequency of each token
word_counts = Counter(tokens)

# Get the 10 most common words as (word, count) pairs
common_words = word_counts.most_common(10)

# Split the pairs into parallel tuples for plotting
words, counts = zip(*common_words)

# Horizontal bar chart: frequency on x, word on y
plt.figure(figsize=(10, 6))
sns.barplot(x=list(counts), y=list(words), palette='viridis')
plt.title("Top 10 Most Frequent Words in Reviews")
plt.xlabel("Frequency")
plt.ylabel("Words")
plt.show()


In [None]:
# Character count of each (cleaned) review, stored next to the text
df_cleaned['Review_Length'] = df_cleaned['Text'].map(len)

# Histogram (30 bins) of the lengths with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['Review_Length'], kde=True, color='blue', bins=30, ax=ax)
ax.set_title("Distribution of Review Lengths")
ax.set_xlabel("Review Length (Number of Characters)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Scatter of review length (x) against sentiment label (y), coloured by label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='Review_Length',
    y='Sentiment',
    data=df_cleaned,
    hue='Sentiment',
    palette='coolwarm',
    alpha=0.6,
    ax=ax,
)
ax.set_title("Sentiment vs Review Length")
ax.set_xlabel("Review Length (Number of Characters)")
ax.set_ylabel("Sentiment (0 = Negative, 1 = Positive)")
plt.xticks(rotation=45)
plt.show()


In [None]:
# One box per sentiment label showing the spread of review lengths
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Sentiment', y='Review_Length', data=df_cleaned, palette='coolwarm', ax=ax)
ax.set_title("Review Length Distribution by Sentiment")
ax.set_xlabel("Sentiment (0 = Negative, 1 = Positive)")
ax.set_ylabel("Review Length (Number of Characters)")
# Replace the numeric tick labels 0/1 with readable names
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairplot to visualize relationships between the two numeric columns.
# pairplot creates its own figure holding a grid of axes, so plt.title would
# only label whichever subplot is current (the last one drawn). Put the heading
# on the figure instead; y > 1 lifts it above the top row of subplots.
pair_grid = sns.pairplot(df_cleaned[['Review_Length', 'Sentiment']])
pair_grid.figure.suptitle("Pairplot of Review Length vs Sentiment", y=1.02)
plt.show()


In [None]:
# Maximum review length (in characters) to keep; longer reviews are treated as outliers
max_review_length = 2000

# Keep only reviews at or below the threshold.
# .copy() detaches the filtered result from the frame it was sliced from, so the
# column assignments made on df_cleaned in later cells do not raise
# SettingWithCopyWarning. Re-running this cell gives the same result.
df_cleaned = df_cleaned[df_cleaned['Review_Length'] <= max_review_length].copy()

# Check the new shape of the dataset
print("Shape after removing outliers:", df_cleaned.shape)


In [None]:
# Confirm the filter worked: the longest remaining review should not exceed max_review_length
print("Max Review Length after cleaning:", df_cleaned['Review_Length'].max())


In [None]:
from nltk.corpus import stopwords
import nltk
# Fetch NLTK's stopword corpus (needed before stopwords.words can be read)
nltk.download('stopwords')

# English stopwords as a set, so the membership test in remove_stopwords is a hash lookup
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        text: The text to filter. Values that are not ``str`` (for example a
            missing value) are passed through untouched instead of raising.

    Returns:
        The remaining words joined by single spaces, or ``text`` itself when it
        is not a string.
    """
    # Guard against non-string cells so .apply() cannot fail on them
    if not isinstance(text, str):
        return text
    # Compare in lowercase against the module-level stop_words set
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Strip stopwords from every review in place, then show the first one as a spot check.
# Note: 'Review_Length' was computed in an earlier cell and is not recomputed here.
df_cleaned['Text'] = df_cleaned['Text'].apply(remove_stopwords)
print("Sample text without stopwords:", df_cleaned['Text'].iloc[0])


In [None]:
def remove_stopwords(text):
    """Return ``text`` without English stopwords; non-strings pass through as-is.

    This redefines the function of the same name from the previous cell, adding
    a type guard.

    Args:
        text: Value to filter; only ``str`` instances are processed.

    Returns:
        The kept words joined by single spaces, or the original object when it
        is not a string.
    """
    # Anything that is not a string is handed back unchanged
    if not isinstance(text, str):
        return text
    kept = []
    for token in text.split():
        # Case-insensitive lookup in the module-level stop_words set
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)


In [None]:
# Encode sentiment labels (assuming the sentiment column is named 'Sentiment')
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels, then map each label to its integer code.
# The fitted encoder is kept because the confusion-matrix cell reads encoder.classes_.
encoder = LabelEncoder()
encoder.fit(df_cleaned['Sentiment'])
df_cleaned['Sentiment'] = encoder.transform(df_cleaned['Sentiment'])

# Show how many rows fall into each encoded class
print(df_cleaned['Sentiment'].value_counts())


In [None]:
from sklearn.model_selection import train_test_split

# Model input is the review text; the value to predict is the sentiment label
X, y = df_cleaned['Text'], df_cleaned['Sentiment']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training data size: {len(X_train)}")
print(f"Testing data size: {len(X_test)}")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer, capped at 5000 features (tune max_features as needed).
# The vocabulary is fitted on the training texts only; the test texts are merely
# transformed with it, so nothing from the test set leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Report the (documents, features) shape of both matrices
print(f"TF-IDF matrix shape (training): {X_train_tfidf.shape}")
print(f"TF-IDF matrix shape (testing): {X_test_tfidf.shape}")


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the TF-IDF training matrix
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Accuracy on the data the model was trained on
train_accuracy = model.score(X_train_tfidf, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")


# Evaluate the model

In [None]:
# Accuracy on the held-out 20% split, printed with two decimals
test_accuracy = model.score(X_test_tfidf, y_test)
print(f"Testing Accuracy: {test_accuracy:.2f}")


In [None]:
# Metric and plotting helpers used by this cell
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Predicted labels for the held-out TF-IDF matrix
y_pred = model.predict(X_test_tfidf)

# Confusion matrix as an annotated heat map; ticks are labelled with the
# classes the label encoder learned
cm = confusion_matrix(y_test, y_pred)
class_labels = encoder.classes_
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1 summary
print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))


# MLflow


In [None]:
!pip install mlflow


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming your data preprocessing and train-test split code are already done:
# X_train_tfidf, X_test_tfidf, y_train, y_test

# Train, evaluate and log one logistic-regression run with MLflow.
# Relies on X_train_tfidf, X_test_tfidf, y_train and y_test from the cells above.
# NOTE: this rebinds `model`, `y_pred` and `cm` from the earlier evaluation cells.

# Single source of truth for the iteration cap: the value that is logged is
# guaranteed to be the value the model was actually trained with.
max_iter = 100

with mlflow.start_run():
    # Log model parameters
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("max_iter", max_iter)

    # Train the Logistic Regression model
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    # Log the accuracy metric
    mlflow.log_metric("accuracy", accuracy)

    # Log the trained model.
    # NOTE(review): only the classifier is logged, not the fitted TF-IDF
    # vectorizer, so the logged model expects already-vectorised input.
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Draw the confusion matrix on a dedicated figure so the saved image cannot
    # pick up content from a figure left open by another cell
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_title("Confusion Matrix")

    # Save the figure to the working directory and attach it to the run
    cm_path = "confusion_matrix.png"
    fig.savefig(cm_path)
    mlflow.log_artifact(cm_path)

    # Display the figure like the other plotting cells do
    plt.show()

    print(f"Model trained and logged with accuracy: {accuracy}")
