In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Helper Function for Text Cleaning:

Implement a Helper Function as per Text Preprocessing Notebook and Complete the following pipeline.

# Build a Text Cleaning Pipeline

# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`.

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - "Complete the above function"

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [23]:
dataset_url = "/content/drive/MyDrive/AI last sem/week8/Copy of trum_tweet_sentiment_analysis.csv"

In [24]:
import pandas as pd
df = pd.read_csv(dataset_url)

In [28]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pandas as pd

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# --- Helper Functions ---

def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def removeunwanted_characters(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

def remove_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

# --- Text Cleaning Pipeline ---

def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Cleans the 'text' column in a DataFrame.
    rule: 'lemmatize' or 'stem'
    Returns: a Series with cleaned text
    """
    data = dataset["text"].astype(str).str.lower()
    data = data.apply(remove_urls)
    data = data.apply(remove_emoji)
    data = data.apply(removeunwanted_characters)
    data = data.apply(remove_punct)

    # Tokenize
    data = data.apply(lambda x: x.split())

    # Remove stopwords
    data = data.apply(lambda tokens: [t for t in tokens if t not in stop_words])

    # Lemmatize or Stem
    if rule == "lemmatize":
        data = data.apply(lemmatization)
    elif rule == "stem":
        data = data.apply(stemming)
    else:
        raise ValueError("Choose rule as either 'lemmatize' or 'stem'")

    # Join back to string
    return data.apply(lambda tokens: " ".join(tokens))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850123 entries, 0 to 1850122
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   text       object
 1   Sentiment  int64 
dtypes: int64(1), object(1)
memory usage: 28.2+ MB


In [29]:
df = pd.read_csv(dataset_url)
cleaned_text = text_cleaning_pipeline(df, rule="lemmatize")
print(cleaned_text.head())


0    rt johnleguizamo trump draining swamp taxpayer...
1    icymi hacker rig fm radio station play antitru...
2    trump protest lgbtq rally new york bbcworld vi...
3    hi im pier morgan david beckham awful donald t...
4    rt glennfranco tech firm suing buzzfeed publis...
Name: text, dtype: object


In [30]:
from sklearn.model_selection import train_test_split

# Assuming 'data' is your cleaned text data and 'df' is your original DataFrame
X_train, X_test, y_train, y_test = train_test_split(cleaned_text, df['Sentiment'], test_size=0.2, random_state=42) # Example test_size and random_state


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model Training and Evaluation
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96    248563
           1       0.94      0.91      0.93    121462

    accuracy                           0.95    370025
   macro avg       0.95      0.94      0.95    370025
weighted avg       0.95      0.95      0.95    370025



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
