In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from colorama import Fore, Style, init
from prettytable import PrettyTable
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize colorama with autoreset enabled
init(autoreset=True)

# Ensure stopwords are downloaded, then keep the English list as a set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# File path
csv_file_path = "E:\\New folder\\archive (2)\\training.1600000.processed.noemoticon.csv"

# Column names are passed explicitly: the first line of the file is a tweet
# record, so letting pandas infer a header would consume that record.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# Read the dataset once, trying each candidate encoding in turn
encodings = ['latin1', 'iso-8859-1', 'cp1252']
for encoding in encodings:
    try:
        twitter_data = pd.read_csv(csv_file_path, names=column_names, encoding=encoding)
        print(Fore.GREEN + f"✅ Successfully read the file with {encoding} encoding.")
        break
    except UnicodeDecodeError:
        print(Fore.RED + f"❌ Failed to read the file with {encoding} encoding.")
else:
    # No candidate decoded the file: stop here rather than continue without data
    raise RuntimeError(f"Could not decode {csv_file_path} with any of {encodings}")

# Display head of the dataframe in a table
print(Fore.YELLOW + "📊 DataFrame Head:")
head_rows = twitter_data.head(5)
table = PrettyTable()
table.field_names = head_rows.columns.tolist()
for row in head_rows.values:
    table.add_row(row)
print(table)

# Check for missing values (per-column count of nulls)
print(Fore.YELLOW + "🔍 Checking for missing values...")
missing_values = twitter_data.isnull().sum()
print(missing_values)

# Check the distribution of target values
print(Fore.YELLOW + "📈 Distribution of target values:")
target_distribution = twitter_data['target'].value_counts()
print(target_distribution)

# Replace target values (4 -> 1) so the two classes are labelled 0 and 1.
# Assigning the column back avoids inplace mutation through DataFrame.replace.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

# Stemming helper built on the English SnowballStemmer
stemmer = nltk.SnowballStemmer('english')

def stemming(content):
    """Normalise a piece of text into a space-separated string of stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, tokens found in
    ``stop_words`` are dropped, and the remaining tokens are stemmed.

    Parameters
    ----------
    content : str
        Raw text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (may be empty).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Apply stemming to the text column in batches
# NOTE: this overwrites the raw 'text' column, so re-running this step stems
# text that has already been stemmed.
print(Fore.YELLOW + "⚙️ Applying stemming to the text column...")
batch_size = 1000
num_rows = len(twitter_data)
# Ceiling division: exactly enough batches to cover every row, with no
# trailing empty batch when the row count is a multiple of batch_size.
num_batches = (num_rows + batch_size - 1) // batch_size
stemmed_texts = []

raw_texts = twitter_data['text']
for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_rows)
    batch_texts = [stemming(text) for text in raw_texts.iloc[start_idx:end_idx]]
    stemmed_texts.extend(batch_texts)

twitter_data['text'] = stemmed_texts

# Vectorize the text data with TfidfVectorizer (vocabulary capped at 5000 features)
# NOTE(review): the vectorizer is fitted on all rows before the split below, so
# its statistics include the test tweets; fit on the training split only if a
# strict hold-out estimate is required.
print(Fore.YELLOW + "📏 Vectorizing the text data...")
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(twitter_data['text'])
y = twitter_data['target'].values

# Split the data (20% held out for testing, fixed seed for repeatability)
print(Fore.YELLOW + "✂️ Splitting the data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
print(Fore.YELLOW + "🤖 Training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test data
print(Fore.YELLOW + "🔍 Predicting on test data...")
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(Fore.GREEN + f"🎯 Accuracy: {accuracy * 100:.2f}%")

# Function to predict sentiment of a tweet
def predict_sentiment(tweet):
    """Classify a single tweet as positive or negative.

    The tweet is normalised with ``stemming``, vectorised with the fitted
    ``vectorizer`` and passed to ``model``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        "Positive 😊" when the predicted label equals 1, otherwise "Negative 😠".
    """
    tweet = stemming(tweet)
    tweet_vector = vectorizer.transform([tweet])
    # predict() yields one label per input row; take the single label explicitly
    # rather than branching on the truth value of an array comparison.
    prediction = model.predict(tweet_vector)[0]
    return "Positive 😊" if prediction == 1 else "Negative 😠"

# User-friendly interaction
def main():
    """Run an interactive loop that labels each tweet the user types.

    The loop ends when the user enters 'exit' (case-insensitive, surrounding
    whitespace ignored), or when input ends or is interrupted. Blank lines
    are skipped instead of being classified.
    """
    print(Fore.YELLOW + "🔮 Welcome to the Tweet Sentiment Analyzer!")
    while True:
        try:
            tweet = input(Fore.CYAN + "Enter a tweet (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupt ends the session the same way 'exit' does
            print(Fore.YELLOW + "👋 Goodbye!")
            break
        if tweet.lower() == 'exit':
            print(Fore.YELLOW + "👋 Goodbye!")
            break
        if not tweet:
            # Nothing to classify
            continue
        sentiment = predict_sentiment(tweet)
        color = Fore.GREEN if "Positive" in sentiment else Fore.RED
        print(color + f"Sentiment: {sentiment}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Successfully read the file with latin1 encoding.
📊 DataFrame Head:
+--------+------------+------------------------------+----------+-----------------+---------------------------------------------------------------------------------------------------------------------+
| target |     id     |             date             |   flag   |       user      |                                                         text                                                        |
+--------+------------+------------------------------+----------+-----------------+---------------------------------------------------------------------------------------------------------------------+
|   0    | 1467810369 | Mon Apr 06 22:19:45 PDT 2009 | NO_QUERY | _TheSpecialOne_ | @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D |
|   0    | 1467810672 | Mon Apr 06 22:19:49 PDT 2009 | NO_QUERY |  scotthamilton  |   is upset that he can't update his Fac

100%|██████████████████████████████████████████████████████████████████████████████| 1601/1601 [02:56<00:00,  9.07it/s]


📏 Vectorizing the text data...
✂️ Splitting the data into training and test sets...
🤖 Training the Logistic Regression model...
🔍 Predicting on test data...
🎯 Accuracy: 76.85%
🔮 Welcome to the Tweet Sentiment Analyzer!


[36mEnter a tweet (or type 'exit' to quit):  k


Sentiment: Positive 😊


[36mEnter a tweet (or type 'exit' to quit):  fuck


Sentiment: Negative 😠


[36mEnter a tweet (or type 'exit' to quit):  exit


👋 Goodbye!


In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from colorama import Fore, Style, init
from prettytable import PrettyTable
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize colorama with autoreset enabled
init(autoreset=True)

# Download the stopword corpus, then collect the English stopwords into a set
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# File path
csv_file_path = "E:\\New folder\\archive (2)\\training.1600000.processed.noemoticon.csv"

# The first line of the file is a tweet record, not a header, so the column
# names are passed explicitly; with an inferred header pandas would consume
# that record as the column labels.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# Try to read the dataset with different encodings
encodings = ['latin1', 'iso-8859-1', 'cp1252']
for encoding in encodings:
    try:
        twitter_data = pd.read_csv(csv_file_path, names=column_names, encoding=encoding)
        print(Fore.GREEN + f"✅ Successfully read the file with {encoding} encoding.")
        break
    except UnicodeDecodeError:
        print(Fore.RED + f"❌ Failed to read the file with {encoding} encoding.")


✅ Successfully read the file with latin1 encoding.


In [3]:
# Load the CSV with explicit column names (the file has no header line)
column_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]
twitter_data = pd.read_csv(
    csv_file_path,
    encoding='ISO-8859-1',
    names=column_names,
)


In [4]:
# Render the first five rows as a PrettyTable
print(Fore.YELLOW + "📊 DataFrame Head:")
preview = twitter_data.head(5)
table = PrettyTable()
table.field_names = preview.columns.tolist()
for record in preview.values:
    table.add_row(record)
print(table)


📊 DataFrame Head:
+--------+------------+------------------------------+----------+-----------------+---------------------------------------------------------------------------------------------------------------------+
| target |     id     |             date             |   flag   |       user      |                                                         text                                                        |
+--------+------------+------------------------------+----------+-----------------+---------------------------------------------------------------------------------------------------------------------+
|   0    | 1467810369 | Mon Apr 06 22:19:45 PDT 2009 | NO_QUERY | _TheSpecialOne_ | @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D |
|   0    | 1467810672 | Mon Apr 06 22:19:49 PDT 2009 | NO_QUERY |  scotthamilton  |   is upset that he can't update his Facebook by texting it... and might cry as a result  S

In [5]:
# Count the null entries in every column
print(Fore.YELLOW + "🔍 Checking for missing values...")
null_mask = twitter_data.isna()
missing_values = null_mask.sum()
print(missing_values)


🔍 Checking for missing values...
target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64


In [6]:
# Count how many rows carry each target label
print(Fore.YELLOW + "📈 Distribution of target values:")
target_column = twitter_data['target']
target_distribution = target_column.value_counts()
print(target_distribution)


📈 Distribution of target values:
target
0    800000
4    800000
Name: count, dtype: int64


In [7]:
# Relabel the target column: 4 becomes 1, other values are left unchanged
twitter_data['target'] = twitter_data['target'].replace({4: 1})


In [8]:
# Stemming helper built on the English SnowballStemmer
stemmer = nltk.SnowballStemmer('english')

def stemming(content):
    """Normalise a piece of text into a space-separated string of stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, tokens found in
    ``stop_words`` are dropped, and the remaining tokens are stemmed.

    Parameters
    ----------
    content : str
        Raw text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (may be empty).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)


In [9]:
# Apply stemming to the text column in batches
# NOTE: this overwrites the raw 'text' column, so re-running the cell stems
# text that has already been stemmed.
print(Fore.YELLOW + "⚙️ Applying stemming to the text column...")
batch_size = 1000
num_rows = len(twitter_data)
# Ceiling division: exactly enough batches to cover every row, with no
# trailing empty batch when the row count is a multiple of batch_size.
num_batches = (num_rows + batch_size - 1) // batch_size
stemmed_texts = []

raw_texts = twitter_data['text']
for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_rows)
    batch_texts = [stemming(text) for text in raw_texts.iloc[start_idx:end_idx]]
    stemmed_texts.extend(batch_texts)

twitter_data['text'] = stemmed_texts


⚙️ Applying stemming to the text column...


100%|██████████████████████████████████████████████████████████████████████████████| 1601/1601 [02:00<00:00, 13.30it/s]


In [10]:
# Build TF-IDF features (vocabulary capped at 5000) and pull out the labels
# NOTE(review): the vectorizer is fitted on every row, including rows that a
# later split may hold out for testing.
print(Fore.YELLOW + "📏 Vectorizing the text data...")
y = twitter_data['target'].values
corpus = twitter_data['text']
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(corpus)


📏 Vectorizing the text data...


In [11]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
print(Fore.YELLOW + "✂️ Splitting the data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


✂️ Splitting the data into training and test sets...


In [12]:
# Fit a logistic regression classifier on the training split
print(Fore.YELLOW + "🤖 Training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)


🤖 Training the Logistic Regression model...


In [13]:
# Label every row of the held-out split with the trained model
print(Fore.YELLOW + "🔍 Predicting on test data...")
y_pred = model.predict(X=X_test)


🔍 Predicting on test data...


In [14]:
# Compare predicted and true test labels and report the accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(Fore.GREEN + f"🎯 Accuracy: {accuracy * 100:.2f}%")

# Function to predict sentiment of a tweet
def predict_sentiment(tweet):
    """Classify a single tweet as positive or negative.

    The tweet is normalised with ``stemming``, vectorised with the fitted
    ``vectorizer`` and passed to ``model``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        "Positive 😊" when the predicted label equals 1, otherwise "Negative 😠".
    """
    tweet = stemming(tweet)
    tweet_vector = vectorizer.transform([tweet])
    # predict() yields one label per input row; take the single label explicitly
    # rather than branching on the truth value of an array comparison.
    prediction = model.predict(tweet_vector)[0]
    return "Positive 😊" if prediction == 1 else "Negative 😠"


🎯 Accuracy: 76.85%


In [None]:
# User-friendly interaction
def main():
    """Run an interactive loop that labels each tweet the user types.

    The loop ends when the user enters 'exit' (case-insensitive, surrounding
    whitespace ignored), or when input ends or is interrupted. Blank lines
    are skipped instead of being classified.
    """
    print(Fore.YELLOW + "🔮 Welcome to the Tweet Sentiment Analyzer!")
    while True:
        try:
            tweet = input(Fore.CYAN + "Enter a tweet (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupt ends the session the same way 'exit' does
            print(Fore.YELLOW + "👋 Goodbye!")
            break
        if tweet.lower() == 'exit':
            print(Fore.YELLOW + "👋 Goodbye!")
            break
        if not tweet:
            # Nothing to classify
            continue
        sentiment = predict_sentiment(tweet)
        color = Fore.GREEN if "Positive" in sentiment else Fore.RED
        print(color + f"Sentiment: {sentiment}")

if __name__ == "__main__":
    main()


🔮 Welcome to the Tweet Sentiment Analyzer!
