### Import libraries and modules 

In [15]:
import pandas as pd


### Load the German language tweets text

In [16]:

# Read the merged German tweets CSV into a DataFrame
merged_csv_path = "/Users/hashimkhan/Documents/greenbootcamps/sentiment_analysis_X/merged_data/merged_data.csv"
german_file = pd.read_csv(merged_csv_path)


### Display the first few rows


In [17]:
# Preview the first rows to check that the CSV was parsed as expected
german_file.head()

Unnamed: 0,username,location,tweet_text,created_at,retweets,likes,language
0,massageportal24,Basel,Massageportal24 Anbieter Eintrag erstellen usw,2025-01-09 18:33:23+00:00,0,0,de
1,ChantalStadelm1,Luzern,Schtzt eure haltet euch konsequent von jeglich...,2025-01-09 17:49:15+00:00,0,1,de
2,berlinerzeitung,"Berlin, Deutschland",Adipositas tdliche Prionenkrankheiten Hitzestr...,2025-01-09 17:02:00+00:00,0,4,de
3,pp_gesundheit,no_location,DIOSynVax grndet neues Beratungsgremium um die...,2025-01-09 16:34:33+00:00,0,0,de
4,MichaPfahl,Germany,Gesundheitsfachkrfte Tauchen Sie in die Welt d...,2025-01-09 16:31:42+00:00,0,0,de


In [18]:
# Summarise column dtypes and non-null counts of the loaded frame
german_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   username    384 non-null    object
 1   location    384 non-null    object
 2   tweet_text  384 non-null    object
 3   created_at  384 non-null    object
 4   retweets    384 non-null    int64 
 5   likes       384 non-null    int64 
 6   language    384 non-null    object
dtypes: int64(2), object(5)
memory usage: 21.1+ KB


In [19]:
# (rows, columns) of the loaded frame
german_file.shape

(384, 7)

In [20]:
# List the column labels available for the translation step
german_file.columns

Index(['username', 'location', 'tweet_text', 'created_at', 'retweets', 'likes',
       'language'],
      dtype='object')

### Load the Translation Model

In [21]:

from transformers import MarianMTModel, MarianTokenizer
# Checkpoint identifier shared by the tokenizer and the translation model
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer, model = (
    MarianTokenizer.from_pretrained(model_name),
    MarianMTModel.from_pretrained(model_name),
)

### Import Language Detection

In [22]:
from langdetect import detect

### Function for Translation from German to English

In [23]:

def translate_to_english(text):
    """
    Translate a single piece of text to English when it is detected as German.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. ``None`` or ``NaN`` coming
        from a pandas column) and empty / whitespace-only strings are
        returned unchanged without running detection or translation.

    Returns
    -------
    str
        The English translation when ``detect`` reports ``"de"``; the
        original text for any other detected language; or the string
        ``"Error: <message>"`` if detection or translation raises.
    """
    # There is nothing to detect or translate in these inputs. Passing them on
    # makes language detection raise, which used to store an "Error: ..."
    # string in place of the tweet.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Leave text that is not detected as German untouched
        if detect(text) != "de":
            return text
        # Tokenize, truncating to at most 512 tokens
        inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
        # Beam-search generation of the English token ids
        outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
        # Convert the best sequence back to plain text
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best-effort: a failure on one tweet must not abort a column-wide
        # .apply(); the failure is surfaced in the returned value instead.
        return f"Error: {e}"

### Call the translate_to_english function - Give 'tweet_text' column as an Input argument

In [24]:

# Translate every tweet and store the result in a new column
german_tweets = german_file["tweet_text"]
german_file["tweet_text_english"] = german_tweets.apply(translate_to_english)

### An example of a translated tweet

In [25]:
# NOTE: despite its name, row_300 holds the row with index label 301
row_300 = german_file.loc[301, ["tweet_text", "tweet_text_english"]]
original_tweet, english_tweet = row_300["tweet_text"], row_300["tweet_text_english"]
# One print call; sep="\n" reproduces the line breaks of four separate prints
print(
    "Original German Tweet:",
    original_tweet,
    "\nTranslated English Tweet:",
    english_tweet,
    sep="\n",
)

Original German Tweet:
POV Linksautonome beschieen Polizisten am Tag der Rumung in Ltzerath mit Pyrotechnik Foto Christoph Hardt Panama Pictures

Translated English Tweet:
POV Left-wing autonomous police arrive on the day of rum in Ltzerath with pyrotechnics Photo Christoph Hardt Panama Pictures


### Save the translated file as "translatedText.csv" in the /translated_data directory

In [26]:
# Re-inspect the frame before saving; it should now include tweet_text_english
german_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   username            384 non-null    object
 1   location            384 non-null    object
 2   tweet_text          384 non-null    object
 3   created_at          384 non-null    object
 4   retweets            384 non-null    int64 
 5   likes               384 non-null    int64 
 6   language            384 non-null    object
 7   tweet_text_english  384 non-null    object
dtypes: int64(2), object(6)
memory usage: 24.1+ KB


In [None]:
# Persist the full frame (original columns plus the translation) without the index
translated_csv_path = '/Users/hashimkhan/Documents/greenbootcamps/sentiment_analysis_X/translated_data/translatedText.csv'
german_file.to_csv(translated_csv_path, encoding="utf-8", index=False)

<bound method DataFrame.info of             username             location  \
0    massageportal24                Basel   
1    ChantalStadelm1               Luzern   
2    berlinerzeitung  Berlin, Deutschland   
3      pp_gesundheit          no_location   
4         MichaPfahl              Germany   
..               ...                  ...   
379       jannikkel6               Berlin   
380          9Incide               Berlin   
381       pat4reason              Hamburg   
382          lud_nrw                  NRW   
383    HeinzK_Berlin          no_location   

                                            tweet_text  \
0       Massageportal24 Anbieter Eintrag erstellen usw   
1    Schtzt eure haltet euch konsequent von jeglich...   
2    Adipositas tdliche Prionenkrankheiten Hitzestr...   
3    DIOSynVax grndet neues Beratungsgremium um die...   
4    Gesundheitsfachkrfte Tauchen Sie in die Welt d...   
..                                                 ...   
379  bei Abrissarbeit

In [28]:
# Export only the translated column; the index is written alongside it
english_only_csv_path = "/Users/hashimkhan/Documents/greenbootcamps/sentiment_analysis_X/translated_data/onlyEnglishTweets.csv"
onlyTheEnglishText = german_file["tweet_text_english"]
onlyTheEnglishText.to_csv(english_only_csv_path)

In [38]:
english_only_csv = "/Users/hashimkhan/Documents/greenbootcamps/sentiment_analysis_X/translated_data/onlyEnglishTweets.csv"
english_tweets = pd.read_csv(english_only_csv)
from tabulate import tabulate
# Keep only the translated column and render positions 318-327 as a boxed text table
english_tweets_table = english_tweets.loc[:, ["tweet_text_english"]]
rows_to_display = english_tweets_table.iloc[slice(318, 328)]
rendered_table = tabulate(rows_to_display, headers='keys', tablefmt='fancy_grid')
print(rendered_table)

╒═════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│     │ tweet_text_english                                                                                                                                                                                                                                                                                             │
╞═════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ 318 │ He shot fireworks in apartment police arrests Arab in