In [19]:
# Mount Google Drive into the Colab filesystem so files stored there can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd


In [None]:
# Read the tweet CSV from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/Trumptweets/trum_tweet_sentiment_analysis.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first 10 rows to check the column names and a few sample values.
df.head(10)

**Helper Function for Text Cleaning:**

Implement a helper function as described in the Text Preprocessing notebook, then complete the pipeline below.

**Build a Text Cleaning Pipeline**

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Fetch the NLTK corpora needed by the `stopwords` and `WordNetLemmatizer` imports above.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
  """Return the NLTK English stopword set, reading the corpus only on first use."""
  global _STOP_WORDS_CACHE
  if _STOP_WORDS_CACHE is None:
    _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE


def text_cleaning_pipeline(dataset, rule = "lemmatize"):
  """
  Clean a single piece of raw text and return it as one space-separated string.

  Steps, in order:
    1. lowercase the text
    2. remove URLs (anything starting with "http" or "www")
    3. replace @mentions with a space
    4. replace emoji / pictographic characters with a space
    5. drop every remaining character that is not an ASCII letter, digit or whitespace
    6. split on whitespace and drop NLTK English stopwords
    7. lemmatize (WordNet) or stem (Porter) each remaining token

  Args:
    dataset: The text to clean (one document, despite the name). None and
      float NaN are treated as missing and yield "". Any other non-string
      value is converted with str() first.
    rule: "lemmatize" (default) or "stem"; selects the token normalizer.

  Returns:
    The cleaned tokens joined by single spaces ("" when nothing survives).

  Raises:
    ValueError: If `rule` is neither "lemmatize" nor "stem".
  """
  # Fail fast on a bad rule instead of printing once per row and returning
  # text that silently skipped stopword removal.
  if rule not in ("lemmatize", "stem"):
    raise ValueError(
        "Unknown rule %r: pick between 'lemmatize' or 'stem'" % (rule,))

  # Missing values (None, or float NaN, which is the only float that is not
  # equal to itself) have nothing to clean.
  if dataset is None or (isinstance(dataset, float) and dataset != dataset):
    return ""

  data = str(dataset).lower()

  # Remove URLs. "http\S+" already covers "https...".
  data = re.sub(r"http\S+|www\S+", '', data)

  # Remove mentions. This must run BEFORE the generic character filter below:
  # once "@" has been stripped the pattern can no longer match and the user
  # name would stay in the text as an ordinary word.
  data = re.sub("@[A-Za-z0-9_]+", " ", data)

  # Replace emojis with a space so words on either side stay separate tokens.
  # The last range spans U+24C2 through U+1F251, so it also covers many
  # non-emoji code points in between.
  data = re.sub(r"["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags
                        u"\u2702-\u27B0"          # dingbats
                        u"\u24C2-\U0001F251"      # enclosed characters
                        "]+",
                        r' ', data, flags=re.UNICODE)

  # Drop all other unwanted characters. Whitespace (newlines, tabs) is kept
  # so that words on different lines are not glued together; split() below
  # collapses it.
  data = re.sub(r"[^0-9A-Za-z\s]", "", data)

  # Tokenize and remove stopwords.
  stop_words = _english_stop_words()
  kept_tokens = [token for token in data.split() if token not in stop_words]

  # Normalize each surviving token with the requested rule.
  if rule == "lemmatize":
    normalize = WordNetLemmatizer().lemmatize
  else:
    normalize = PorterStemmer().stem

  return " ".join(normalize(token) for token in kept_tokens)


**Text Classification using Machine Learning Models**

**📝 Instructions: Trump Tweet Sentiment Classification**

1. Load the Dataset
Load the dataset named "trump_tweet_sentiment_analysis.csv" using pandas. Ensure the dataset contains at least two columns: "text" and a label column (named "Sentiment" in the CSV used by this notebook).

2. Text Cleaning and Tokenization
Apply a text preprocessing pipeline to the "text" column. This should include:

Lowercasing the text
Removing URLs, mentions, punctuation, and special characters
Removing stopwords
Tokenization (optional: stemming or lemmatization)
"Complete the above function"

3. Train-Test Split
Split the cleaned and tokenized dataset into training and testing sets using train_test_split from sklearn.model_selection.

4. TF-IDF Vectorization
Import and use the TfidfVectorizer from sklearn.feature_extraction.text to transform the training and testing texts into numerical feature vectors.

5. Model Training and Evaluation
Import Logistic Regression (or any machine learning model of your choice) from sklearn.linear_model. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.

Print the classification report using classification_report from sklearn.metrics.

In [18]:
# Clean every tweet with the lemmatizing pipeline; Series.apply forwards the
# extra keyword argument to the function on each call.
df['clean_text'] = df['text'].apply(text_cleaning_pipeline, rule="lemmatize")

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
features = df['clean_text']
labels = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Learn the vocabulary and IDF weights from the training texts only, then reuse
# that fitted vectorizer on the test texts so both matrices share one feature space.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [23]:
# Fit a default-configured logistic regression on the TF-IDF training matrix
# (fit returns the estimator itself), then predict labels for the test matrix.
model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

In [24]:
# Per-class precision, recall and F1 for the predictions on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96    248563
           1       0.94      0.91      0.92    121462

    accuracy                           0.95    370025
   macro avg       0.95      0.94      0.94    370025
weighted avg       0.95      0.95      0.95    370025

