In [None]:
import zipfile
import os
import pandas as pd

# Path to the ZIP file
zip_path = r'C:\Users\USER\Downloads\Twitter-Sentiments.zip'
# Directory where you want to extract the files
extract_path = r'C:\Users\USER\Downloads\Twitter-Sentiments'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Load the dataset (assuming it is a CSV file)
csv_file_path = os.path.join(extract_path, 'Tweets.csv')
tweets = pd.read_csv(csv_file_path)

# Display the first few rows of the dataset
tweets.head()


In [2]:
import pandas as pd

# Display basic information about the dataset
print(tweets.info())

# Display the first few rows of the dataset
print(tweets.head())

# Check for missing values
print(tweets.isnull().sum())

# Drop unnecessary columns
tweets_cleaned = tweets.drop(columns=['tweet_id', 'airline_sentiment_gold', 'negativereason_gold', 'tweet_coord', 'tweet_location', 'user_timezone'])

# Handle missing values
# For simplicity, we'll drop rows with missing values in 'airline_sentiment' and 'text'
tweets_cleaned = tweets_cleaned.dropna(subset=['airline_sentiment', 'text'])

# Display the first few rows of the cleaned dataset
print(tweets_cleaned.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [4]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stopwords and lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    
    # Join tokens back to a single string
    return ' '.join(tokens)

# Apply the preprocessing function to the 'text' column
tweets_cleaned['cleaned_text'] = tweets_cleaned['text'].apply(preprocess_text)

# Display the first few rows of the cleaned dataset
print(tweets_cleaned[['text', 'cleaned_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                        cleaned_text  
0                        virginamerica dhepburn said  
1  virginamerica plus youve added commercial expe...  
2  virginamerica didnt today must mean need take ...  
3  virginamerica really aggressive blast obnoxiou...  
4                 virginamerica really big bad thing  


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf_vectorizer.fit_transform(tweets_cleaned['cleaned_text'])

# Convert to DataFrame for better readability
X = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the shape of the resulting feature matrix
print(X.shape)


(14640, 5000)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Target variable
y = tweets_cleaned['airline_sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.79      0.94      0.86      1889
     neutral       0.63      0.39      0.48       580
    positive       0.75      0.53      0.62       459

    accuracy                           0.77      2928
   macro avg       0.72      0.62      0.66      2928
weighted avg       0.75      0.77      0.75      2928

Accuracy: 0.7670765027322405


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)


Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [None]:
print("Testing notebook environment.")
