In [3]:
import kagglehub
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download the Sentiment140 dataset from Kaggle
path = kagglehub.dataset_download("kazanova/sentiment140")

# Display the path to the downloaded dataset files
print("Path to dataset files:", path)


Path to dataset files: C:\Users\misss\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2


In [4]:
file_path = r"C:\Users\misss\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2\training.1600000.processed.noemoticon.csv"

# Define column names for the dataset
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Load the dataset using the specified encoding
data = pd.read_csv(file_path, encoding='latin-1', names=columns)

# Keep only the 'target' and 'text' columns
data = data[['target', 'text']]

# Convert sentiment labels to binary (0 = negative, 1 = positive)
data['target'] = data['target'].replace({4: 1})

# Display the first rows of the dataset
print(data.head())


   target                                               text
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       0  is upset that he can't update his Facebook by ...
2       0  @Kenichan I dived many times for the ball. Man...
3       0    my whole body feels itchy and like its on fire 
4       0  @nationwideclass no, it's not behaving at all....


In [5]:
data.shape

(1600000, 2)

In [6]:

nltk.download('stopwords', download_dir='C:/Users/misss/Formation/Projets/AnalyseSentiments/nltk_data')
nltk.download('punkt_tab', download_dir='C:/Users/misss/Formation/Projets/AnalyseSentiments/nltk_data')
nltk.download('wordnet', download_dir='C:/Users/misss/Formation/Projets/AnalyseSentiments/nltk_data')
nltk.download('word_tokenize', download_dir='C:/Users/misss/Formation/Projets/AnalyseSentiments/nltk_data')
nltk.data.path.append('C:/Users/misss/Formation/Projets/AnalyseSentiments/nltk_data')


[nltk_data] Downloading package stopwords to C:/Users/misss/Formation/
[nltk_data]     Projets/AnalyseSentiments/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt_tab to C:/Users/misss/Formation/
[nltk_data]     Projets/AnalyseSentiments/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to C:/Users/misss/Formation/Pr
[nltk_data]     ojets/AnalyseSentiments/nltk_data...
[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index


In [7]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
OOV_TOKEN = "<OOV>"


def clean_text(text):
    """Basic text cleaning: remove mentions, URLs and lowercase the text."""
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    return text.lower()


def preprocess_tfidf(text):
    """Preprocess for TF-IDF: remove punctuation, lemmatize, handle stopwords and OOV."""
    if isinstance(text, str):
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) if word not in stop_words else OOV_TOKEN for word in tokens]
        return " ".join(tokens)
    return ""


def preprocess_lem_with_stopwords(text):
    """Lemmatization with stopwords, OOV token replacement, returns token list."""
    if isinstance(text, str):
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) if word not in stop_words else OOV_TOKEN for word in tokens]
        return tokens
    return []


def preprocess_lem_no_stopwords(text):
    """Lemmatization without stopwords, returns token list."""
    if isinstance(text, str):
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens
    return []


def preprocess_stem_with_stopwords(text):
    """Stemming with stopwords and OOV handling, returns token list."""
    if isinstance(text, str):
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        tokens = [stemmer.stem(word) if word not in stop_words else OOV_TOKEN for word in tokens]
        return tokens
    return []


def preprocess_stem_no_stopwords(text):
    """Stemming without stopwords, returns token list."""
    if isinstance(text, str):
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
        return tokens
    return []


def preprocess_bert(text):
    """Minimal preprocessing for BERT: lowercase, replace mentions and URLs with OOV."""
    if isinstance(text, str):
        text = re.sub(r"@\w+", OOV_TOKEN, text)
        text = re.sub(r"http\S+", OOV_TOKEN, text)
        return text.lower()
    return text


data["text_cleaned"] = data["text"].apply(clean_text)
data["text_tfidf"] = data["text_cleaned"].apply(preprocess_tfidf)
data["text_lem_with_stopwords"] = data["text_cleaned"].apply(preprocess_lem_with_stopwords)
data["text_lem_no_stopwords"] = data["text_cleaned"].apply(preprocess_lem_no_stopwords)
data["text_stem_with_stopwords"] = data["text_cleaned"].apply(preprocess_stem_with_stopwords)
data["text_stem_no_stopwords"] = data["text_cleaned"].apply(preprocess_stem_no_stopwords)
data["text_bert"] = data["text"].apply(preprocess_bert)

print(data.head())


   target                                               text  \
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1       0  is upset that he can't update his Facebook by ...   
2       0  @Kenichan I dived many times for the ball. Man...   
3       0    my whole body feels itchy and like its on fire    
4       0  @nationwideclass no, it's not behaving at all....   

                                        text_cleaned  \
0    - awww, that's a bummer.  you shoulda got da...   
1  is upset that he can't update his facebook by ...   
2   i dived many times for the ball. managed to s...   
3    my whole body feels itchy and like its on fire    
4   no, it's not behaving at all. i'm mad. why am...   

                                          text_tfidf  \
0  awww thats <OOV> bummer <OOV> shoulda got davi...   
1  <OOV> upset <OOV> <OOV> cant update <OOV> face...   
2  <OOV> dived many time <OOV> <OOV> ball managed...   
3  <OOV> whole body feel itchy <OOV> like <OOV> <...  

In [8]:
# Export the cleaned dataset to a CSV file
data.to_csv('tweets_cleaned.csv', index=False)
