<div class="alert alert-block alert-info">

## <center> <b> Stock Sentiment </b> </center>
## <center> Predicting market behavior from tweets </center> <br>
##  <center> <b> EDA </b> </center> <br>
## <center> Spring Semester 2024-2025 </center>

<center> Group 35: </center>
<center>José Cavaco, .... <br></center>
<center> ?? <br></center>
<center>Matilde Miguel, 20240549 <br></center>
<center>Joana Esteves, 20240746 <br></center>
<center>Rita Serra, 20240515 <br></center>

</div>

**Table of Contents**
- [1. Import the needed Libraries](#importlibraries)
- [2. Import Dataset](#importdataset)
- [3. Exploratory Analysis](#section_3)
- [4. Preprocessing](#preprocessing)


<a class="anchor" id="importlibraries">

# 1. Import the needed Libraries

</a>

In [1]:
#!pip install langdetect
#!pip install transformers sentencepiece


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from langdetect import detect

#Preprocessing
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect

  from .autonotebook import tqdm as notebook_tqdm


<a class="anchor" id="importdataset">

# 2. Import the dataset

</a>

In [3]:
# Load the train and test splits from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,text,label,language
0,$BYND - JPMorgan reels in expectations on Beyo...,0,en
1,$CCL $RCL - Nomura points to bookings weakness...,0,en
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0,en
3,$ESS: BTIG Research cuts to Neutral https://t....,0,en
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0,da


<a class="anchor" id="section_3">

# 3. Exploratory Analysis

</a>

In [5]:
# (rows, columns) of the training frame.
train_df.shape

(9543, 3)

### Missing Values

In [6]:
# Number of missing values per column.
train_df.isna().sum()

text        0
label       0
language    0
dtype: int64

<a class="anchor" id="preprocessing">

# 4. Preprocessing

</a>

### Languages

In [7]:
def detect_language(text):
    """Return the language code that ``detect`` assigns to ``text``.

    Detection is best-effort: if ``detect`` raises an ordinary exception the
    sentinel ``"unknown"`` is returned instead, so the function can be mapped
    over a whole column without aborting (URL-only tweets end up as
    ``"unknown"`` in the counts printed further down).

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        Whatever ``detect`` returns, or ``"unknown"`` if it raised.
    """
    try:
        return detect(text)
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit can still stop a long-running ``.apply``.
    except Exception:
        return "unknown"

In [8]:
# Detect the language of every tweet; detect_language yields "unknown" when
# detection fails, so the column never contains missing values from this step.
train_df['language'] = train_df['text'].apply(detect_language)
test_df['language'] = test_df['text'].apply(detect_language)

In [9]:
# Persist the detected language next to each tweet.
# NOTE: these are the same paths that were read with pd.read_csv earlier, so
# the input files are overwritten in place and a later run loads CSVs that
# already contain the `language` column.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Distribution of detected languages in the training set.
print(train_df['language'].value_counts())

language
en         8840
ca          115
de          111
es           88
fr           84
sv           58
da           43
no           36
nl           33
it           26
af           23
tl           17
pt           13
et            9
id            9
pl            7
vi            5
unknown       4
ro            4
fi            2
sl            2
sw            2
hu            2
so            2
cy            2
cs            1
zh-cn         1
hr            1
sk            1
ja            1
sq            1
Name: count, dtype: int64


In [10]:
# Distribution of detected languages in the test set.
print(test_df['language'].value_counts())

language
en    2234
ca      37
es      25
fr      20
de      15
sv      11
no       8
nl       7
tl       6
da       5
af       5
it       4
et       3
cy       2
so       2
pl       1
pt       1
sw       1
vi       1
Name: count, dtype: int64


In [12]:
# Load translation model
def load_translator(model_name, device=0):
    """Build a Hugging Face ``translation`` pipeline for ``model_name``.

    Parameters
    ----------
    model_name : str
        Model identifier passed to ``pipeline`` as ``model``.
    device : int, optional
        Forwarded unchanged to ``pipeline``. The default ``0`` keeps the
        previous behaviour (first CUDA GPU); pass ``-1`` to run on CPU when no
        GPU is available.

    Returns
    -------
    The object returned by ``pipeline("translation", ...)``.
    """
    return pipeline("translation", model=model_name, device=device)


# Function to translate
def translate_texts(texts, languages, translators, batch_size=64):
    """Translate every non-English text, trying ``translators`` in order.

    Parameters
    ----------
    texts : sequence
        Texts to translate.
    languages : sequence
        Language code of each text, aligned with ``texts``. Entries equal to
        ``'en'`` are passed through untouched.
    translators : iterable of callables
        Each is called as ``translator(text, max_length=512)`` and must return
        a sequence whose first item has a ``'translation_text'`` key. The first
        translator that does not raise wins.
    batch_size : int, optional
        Kept for backward compatibility. Texts are translated one at a time,
        so the value does not affect the result; it must still be positive.

    Returns
    -------
    list
        One entry per input text, in order. A text is returned unchanged when
        it is English, when every translator raised, or when the winning
        translator produced an empty result.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or if ``texts`` and ``languages``
        differ in length.
    """
    import logging

    # A zero step used to raise from range() and a negative one silently
    # produced an empty result; reject both explicitly.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # zip() would otherwise truncate to the shorter input without notice.
    if len(texts) != len(languages):
        raise ValueError(
            f"texts and languages must have the same length "
            f"(got {len(texts)} and {len(languages)})"
        )

    logger = logging.getLogger(__name__)
    translated_texts = []

    for text, lang in zip(texts, languages):
        if lang == 'en':
            translated_texts.append(text)  # Already English
            continue

        translated = None
        for position, translator in enumerate(translators):
            try:
                translated = translator(text, max_length=512)[0]['translation_text']
                break
            except Exception as exc:
                # Best effort: record the failure and try the next translator.
                logger.debug("translator #%d failed on %r: %s", position, text, exc)
        else:
            # Loop finished without a break: no translator succeeded.
            logger.warning("no translator succeeded for %r; keeping original text", text)

        # Fallback to original if all fail or the translation is empty
        translated_texts.append(translated if translated else text)

    return translated_texts


# Translate and save datasets
def translate_and_save(train_path, test_path, model_names,
                       train_out='train_translated.csv', test_out='test_translated.csv'):
    """Translate the train and test CSVs to English and write the results.

    Both input files must provide a ``text`` and a ``language`` column. Each
    output gains ``text_translated`` (from ``translate_texts``) and
    ``language_translated`` (language re-detected on the translated text).

    Parameters
    ----------
    train_path, test_path : str
        CSV files to read.
    model_names : iterable of str
        Model identifiers, loaded once each via ``load_translator`` and tried
        in this order.
    train_out, test_out : str, optional
        Destination CSV paths.
    """
    translators = list(map(load_translator, model_names))

    # Read both inputs up front, train first.
    frames = [pd.read_csv(train_path), pd.read_csv(test_path)]

    # Translate train, then test, re-detecting the language afterwards.
    for frame in frames:
        frame['text_translated'] = translate_texts(
            frame['text'].tolist(),
            frame['language'].tolist(),
            translators)
        frame['language_translated'] = frame['text_translated'].apply(detect_language)

    # Only write once both datasets have been processed.
    for frame, destination in zip(frames, (train_out, test_out)):
        frame.to_csv(destination, index=False)

# Models, in the order translate_texts tries them for each non-English text.
# NOTE(review): the m2m100 and NLLB checkpoints are many-to-many models, and
# translate_texts calls each translator with only the text and max_length.
# Confirm these fallbacks can actually run without explicit source/target
# languages; otherwise only the first model ever produces output.
model_list = [
    'Helsinki-NLP/opus-mt-mul-en',
    'facebook/m2m100_418M',
    'facebook/nllb-200-distilled-600M']

# Loads every model, translates both CSVs and writes the results to the
# default output paths of translate_and_save.
translate_and_save('train.csv', 'test.csv', model_names=model_list)

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [13]:
# Reload the translated datasets from the default output paths of
# translate_and_save.
train_en_df = pd.read_csv('train_translated.csv')
test_en_df = pd.read_csv('test_translated.csv')

In [14]:
# Languages detected on the translated training texts.
print(train_en_df['language_translated'].value_counts())

language_translated
en         9050
de           84
ca           75
fr           54
sv           46
es           45
da           31
it           31
nl           23
no           17
af           15
pt           12
ro           11
pl            9
tl            8
et            7
id            6
so            3
sl            3
vi            3
unknown       3
cy            2
el            1
lt            1
sk            1
sw            1
sq            1
Name: count, dtype: int64


In [15]:
# Languages detected on the translated test texts.
print(test_en_df['language_translated'].value_counts())

language_translated
en         2279
ca           25
de           15
es           11
fr           10
ro            6
da            6
sv            6
tl            6
no            5
af            4
nl            4
pt            2
cy            2
it            2
so            1
sw            1
unknown       1
vi            1
et            1
Name: count, dtype: int64


In [17]:
# Rows whose translated text is still not detected as English, ordered by the
# detected language.
train_en_df[train_en_df['language_translated'] != 'en'].sort_values(by='language_translated')

Unnamed: 0,text,label,language,text_translated,language_translated
1202,"AEP seeking wind, solar energy in PJM",2,af,"AEP looking wind, solar energy in PJM",af
5786,UK watchdog probing Google-Looker deal,0,af,UK watchdog testing Google-looker deal,af
9146,$BB keeps going higher on volume.,1,en,$BB keeps going higher on volume.,af
8167,$CEI Woke up,2,hr,$CEI Week up,af
1322,"Boeing, Airbus kept in suspense over big Dubai...",2,af,"Boeing, Airbus keeps in suspension over big Du...",af
...,...,...,...,...,...
4683,https://t.co/oJxNPEUpWq,2,unknown,https://t.co/oJxNPEUPWq,unknown
4681,https://t.co/575AH1YRkF,2,unknown,https://t.co/575AH1YRkF,unknown
8616,HSAC,2,vi,HSAC,vi
8590,HOML,2,vi,HOML,vi


In [20]:
# Keep the rows not detected as English after translation, add the length of
# the original text, and sort by detected language and then by ascending length.
nao_en_ordenado = (
    train_en_df.loc[train_en_df['language_translated'] != 'en']
    .assign(text_length=lambda frame: frame['text'].str.len())
    .sort_values(by=['language_translated', 'text_length'], ascending=[True, True])
)

# Show the result
nao_en_ordenado[['language_translated', 'text_length', 'text']].head()


Unnamed: 0,language_translated,text_length,text
8167,af,12,$CEI Woke up
9146,af,33,$BB keeps going higher on volume.
2440,af,36,U.S. Bancorp declares $0.42 dividend
1202,af,37,"AEP seeking wind, solar energy in PJM"
5786,af,38,UK watchdog probing Google-Looker deal


In [None]:
# Preprocessing (Rita)

def basic_clean(text):
    """Strip URLs, @mentions, #hashtags and $TICKER symbols from a tweet.

    Parameters
    ----------
    text : object
        Tweet text; converted with ``str()`` first, so non-string values are
        cleaned via their string form.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace removed. Whitespace
        left behind inside the text by removed tokens is not collapsed.
    """
    text = str(text)
    # URLs go first so that '#' or '@' fragments inside them disappear with them.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)   # Remove URLs
    text = re.sub(r'@\w+', '', text)                      # Remove @mentions
    text = re.sub(r'#\w+', '', text)                      # Remove hashtags
    # The dollar sign must be escaped: a bare '$' is the end-of-string anchor,
    # which made this pattern unable to match any ticker.
    text = re.sub(r'\$[A-Z]{1,6}\b', '', text)            # Remove tickers like $AAPL
    return text.strip()

# Store the cleaned version of every training tweet in a new column.
train_df["clean_text"] = train_df["text"].apply(basic_clean)
# NOTE: `re` is already imported in the imports cell; this repeat is redundant.
import re

# A whole word that contains some character repeated three or more times in a
# row, anywhere in the word (e.g. "soooo", "looooong", "yessss").
# The earlier pattern allowed exactly one character before and one after the
# repeated run, so it missed most elongated words.
elongation_pattern = re.compile(r"\b\w*?(\w)\1{2,}\w*\b")

def get_elongated_words(text):
    """Return every elongated word found in ``text``, in order of appearance.

    Parameters
    ----------
    text : object
        Input text; converted with ``str()`` before matching.

    Returns
    -------
    list of str
        The full words matched by ``elongation_pattern`` (empty if none).
    """
    return [match.group(0) for match in elongation_pattern.finditer(str(text))]

# Flatten the per-tweet lists of elongated words into one list, keeping row
# order and duplicates; tweets without any match contribute nothing.
all_elongated_words = [
    word
    for words_in_tweet in train_df["clean_text"].apply(get_elongated_words)
    for word in words_in_tweet
]

def is_valid_elongation(word):
    """Tell whether ``word`` is an elongated word that should be corrected.

    A word qualifies when it consists only of letters, has more than three
    characters, and is not written entirely in upper case (all-caps tokens
    are likely tickers).

    Returns
    -------
    bool
    """
    if not word.isalpha():
        return False      # digits, punctuation or empty string
    if len(word) <= 3:
        return False      # too short
    return not word.isupper()

# De-duplicate, then sort longest-first (ties alphabetically). Converting the
# set straight to a list gave an order that can change between kernel runs,
# which made the printed sample and any order-sensitive replacement
# irreproducible. Longest-first also ensures a shorter elongation never
# pre-empts a longer one that contains it.
valid_elongated_words = sorted(
    {w for w in all_elongated_words if is_valid_elongation(w)},
    key=lambda w: (-len(w), w),
)

print("✅ Valid elongated words that will be corrected:")
print(valid_elongated_words[:20])
def reduce_elongation(word):
    """Collapse every run of three or more identical characters to two.

    Example: ``"soooo"`` -> ``"soo"``.
    """
    return re.sub(r'(.)\1{2,}', r'\1\1', word)


def fix_valid_elongations(text, elong_words):
    """Shorten the listed elongated words wherever they occur in ``text``.

    Only whole words are replaced: each run of word characters in ``text`` is
    looked up in ``elong_words`` and, on a hit, passed through
    ``reduce_elongation``. The previous substring-based ``str.replace`` also
    rewrote fragments inside longer words and depended on the order of
    ``elong_words``.

    Parameters
    ----------
    text : str
        Text to correct.
    elong_words : iterable of str
        Words to shorten; expected to consist of word characters only.

    Returns
    -------
    str
        ``text`` with every listed word shortened; unchanged if none occur.
    """
    if isinstance(elong_words, (set, frozenset)):
        targets = elong_words
    else:
        targets = set(elong_words)   # constant-time membership checks
    if not targets:
        return text

    def _shorten(match):
        token = match.group(0)
        return reduce_elongation(token) if token in targets else token

    # Single pass over the text instead of one scan per candidate word.
    return re.sub(r'\w+', _shorten, text)

# Apply the elongation fix to every cleaned tweet, overwriting the
# `clean_text` column in place (values are converted with str() first).
train_df["clean_text"] = train_df["clean_text"].apply(
    lambda x: fix_valid_elongations(str(x), valid_elongated_words)
)