In [1]:
import pandas as pd
import numpy as np
import fasttext
from transformers import pipeline
import swifter
from tqdm import tqdm
from pandarallel import pandarallel
from transformers import MarianMTModel, MarianTokenizer
import time
import warnings
warnings.filterwarnings('ignore')

  from pandas.core import (


In [2]:
# Initialise pandarallel (with progress bars) so that `parallel_apply` can be used on pandas objects later on.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


# Read data from csv file

In [3]:
# Read only the columns needed for this analysis from the raw CSV export.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (and match the style of the other paths used when saving/loading cleaned data).
df = pd.read_csv(
    "../data/row_data/News.csv",
    usecols=[
        'Supplier_Name',
        'News_Title', 'News_Link', 'News_Date', 'Description',
        'Initial_Classification', 'Machine_Learning',
        'Confidence', 'Trusted_Language',
    ],
)

# Data Pre-processing

In [4]:
# Check the length of the dataframe (number of rows)
len(df.index)

714029

In [5]:
# Quick overview of every column: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714029 entries, 0 to 714028
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Supplier_Name           714029 non-null  object 
 1   News_Title              707263 non-null  object 
 2   News_Link               702215 non-null  object 
 3   News_Date               610048 non-null  object 
 4   Description             17320 non-null   object 
 5   Initial_Classification  714029 non-null  object 
 6   Machine_Learning        714029 non-null  object 
 7   Confidence              714029 non-null  float64
 8   Trusted_Language        371213 non-null  object 
dtypes: float64(1), object(8)
memory usage: 49.0+ MB


In [6]:
# List the columns of the dataframe
df.columns

Index(['Supplier_Name', 'News_Title', 'News_Link', 'News_Date', 'Description',
       'Initial_Classification', 'Machine_Learning', 'Confidence',
       'Trusted_Language'],
      dtype='object')

In [7]:
# Display the first 3 rows of the dataframe
df.head(3)

Unnamed: 0,Supplier_Name,News_Title,News_Link,News_Date,Description,Initial_Classification,Machine_Learning,Confidence,Trusted_Language
0,European-Mediterranean Seismological Centre,CENTRAL TURKEY,https://www.emsc-csem.org/Earthquake/earthquak...,2023-01-02 16:47:21.4,,No Classification Found,can't predict | insufficient description,100.0,
1,European-Mediterranean Seismological Centre,CENTRAL TURKEY,https://www.emsc-csem.org/Earthquake/earthquak...,2023-01-02 01:51:45.2,,No Classification Found,can't predict | insufficient description,100.0,
2,European-Mediterranean Seismological Centre,CENTRAL TURKEY,https://www.emsc-csem.org/Earthquake/earthquak...,2023-01-03 15:59:56.9,,No Classification Found,can't predict | insufficient description,100.0,


In [8]:
# Filter out every row that belongs to one of the three suppliers listed below
df = df[~df['Supplier_Name'].isin(['Jenoptik', 'Gabrian International (H.K.) Ltd', 'European-Mediterranean Seismological Centre'])]

In [9]:
# Confirm that the suppliers were filtered out:
# each line prints True when no remaining row has that supplier name.
print(not (df["Supplier_Name"] == 'Jenoptik').any())
print(not (df["Supplier_Name"] == 'Gabrian International (H.K.) Ltd').any())
print(not (df["Supplier_Name"] == 'European-Mediterranean Seismological Centre').any())

True
True
True


In [10]:
# Count the null values in the news title column
df['News_Title'].isnull().sum()

6765

In [11]:
# Drop the rows whose news title is null.
# dropna(subset=...) states the intent directly instead of collecting index labels and dropping them.
df = df.dropna(subset=['News_Title'])

In [12]:
# Confirm that the rows with a null title were removed (expected result: 0)
df['News_Title'].isnull().sum()

0

In [13]:
# Check the shortest and longest title length (in characters).
# The lengths are computed once with the vectorised .str.len() and both extremes are
# read from that single result, instead of running apply(len) over the column twice.
min_max_title = tuple(df['News_Title'].str.len().agg(['min', 'max']))
min_max_title

(1, 32198)

In [14]:
# Keep only the titles that are at least 15 characters long (the comparison is >=, so 15 itself is kept).
df = df[df['News_Title'].str.len() >= 15]

In [15]:
# Double-check the shortest and longest title length after filtering.
# The lengths are computed once with the vectorised .str.len() and both extremes are
# read from that single result, instead of running apply(len) over the column twice.
min_max_title = tuple(df['News_Title'].str.len().agg(['min', 'max']))
min_max_title

(15, 32198)

In [16]:
# Ignore every row whose Trusted_Language is already marked as English ('en').
# Rows with a missing Trusted_Language are kept, because NaN != 'en' evaluates to True.
df = df[df['Trusted_Language'] != 'en']

In [17]:
# Reset the index after filtering.
# drop=True is not passed, so the previous index labels are kept as a new 'index' column.
df = df.reset_index()

In [18]:
# Check the number of rows left after filtering and preprocessing, then preview the first 3 rows.
print(len(df))
df.head(3)

283450


Unnamed: 0,index,Supplier_Name,News_Title,News_Link,News_Date,Description,Initial_Classification,Machine_Learning,Confidence,Trusted_Language
0,26018,Finanz Informatik,Das neue FI-Magazin 4/2023 mit den Highlights ...,https://www.f-i.de/News/News-Presse/Aktuelles/...,20.12.2023 Aktuelles,,No Classification Found,can't predict | other languages,100.0,de
1,26019,Finanz Informatik,Das war der Welcome Day im November 2023,https://www.f-i.de/News/News-Presse/Aktuelles/...,30.11.2023 Aktuelles,,No Classification Found,can't predict | other languages,100.0,de
2,26020,Finanz Informatik,FI-Forum 2023 begeisterte die Besucherinnen un...,https://www.f-i.de/News/News-Presse/Aktuelles/...,27.11.2023 Aktuelles,,No Classification Found,can't predict | other languages,100.0,de


In [19]:
# Save the cleaned DataFrame to an Excel file (written with the openpyxl engine, without the index).
df.to_excel(r'../data/cleaned data/cleaned_data.xlsx', index=False, engine='openpyxl')

# Detect languages and translate based on news title
#### Load Fasttext model for language detection purposes.

In [20]:

# Location of the fastText language-identification model.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
model_loc = "../models/fast_api_model/lid.176.bin"
# Load the model once; it is reused by every detection call below.
Language_detection_model = fasttext.load_model(model_loc)

#### Function to return the detected language and the detection accuracy per text

In [21]:
# Function to return the detected language and the confidence using the fastText pre-trained model.
def Detect_language_Accuracy(text , model_loc = model_loc,model = Language_detection_model):
    """Detect the language of ``text`` and report the model's confidence.

    Args:
        text: The string to classify. Line breaks are replaced by spaces before
            prediction, because fastText's ``predict`` handles one line at a
            time and raises ``ValueError`` when the input contains ``"\\n"``.
        model_loc: Not used by the function; kept so existing callers that pass
            it keep working.
        model: Object whose ``predict(text)`` returns ``(labels, probabilities)``.

    Returns:
        A ``(language, accuracy)`` tuple: ``language`` is the third
        ``"__"``-separated piece of the top label (``"en"`` for
        ``"__label__en"``), ``accuracy`` is the top probability as a percentage
        string with two decimals, or without decimals once it reaches 100.
    """
    # fastText rejects multi-line input, so flatten the text to a single line first.
    single_line_text = text.replace("\n", " ")
    # Prediction
    model_predict = model.predict(single_line_text)
    # Predicted language
    language = model_predict[0][0].split("__")[2]
    # Predicted Accuracy
    accuracy_value = model_predict[1][0] * 100
    # Format Accuracy: values at or above 100 are shown without decimals.
    if accuracy_value >= 100:
        accuracy = f"{accuracy_value:.0f}%"
    else:
        accuracy = f"{accuracy_value:.2f}%"
    # Return language and accuracy
    return language, accuracy


#### Function to return translated text based on hugging face model = "Helsinki-NLP/opus-mt-mul-en"

In [22]:
# Load the translation model and its tokenizer once; both are used as the default
# arguments of translate_text defined below.
model_name = "Helsinki-NLP/opus-mt-mul-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text, model=model, tokenizer=tokenizer):
    """Translate ``text`` with the supplied model/tokenizer pair.

    The text is tokenized into PyTorch tensors (truncated, padded to the
    longest sequence), passed to ``model.generate`` and the first generated
    sequence is decoded back to a string without special tokens.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="longest")
    generated_ids = model.generate(**encoded)
    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

#### Function to return text summarization

In [35]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def Text_Summarization(text,model = summarizer):
    """Summarize ``text`` with the given summarization pipeline.

    Args:
        text: The text to summarize. Values that are not strings (for example
            the NaN placeholders of empty ``Description`` cells) and blank
            strings are not sent to the model.
        model: Callable summarization pipeline; called with ``min_length=30``
            and ``max_length=130``.

    Returns:
        The ``summary_text`` of the first result, or an empty string when there
        is nothing to summarize.
    """
    # Most Description cells are empty; pandas represents them as float NaN,
    # which the pipeline cannot process.
    if not isinstance(text, str) or not text.strip():
        return ""
    # truncation=True clips inputs that are longer than the model accepts
    # instead of letting the call fail on them.
    summarized_text = model(text, min_length=30, max_length=130, truncation=True)[0]['summary_text']
    return summarized_text

#### check flow on single row.

In [29]:
# Single text to check the detected language, the accuracy and the translated text.
text = "株式会社 大真空「統合報告書 2023」発行に関するお知らせ（PDF）"
# The detector returns (language, accuracy); unpack one call instead of running the model twice.
detected, Accuracy = Detect_language_Accuracy(text)
translated_text = translate_text(text,model ,tokenizer)
print(f"detected_lang, Accuracy, Translated text: {detected}, {Accuracy}, {translated_text}")

detected_lang, Accuracy, Translated text: ja, 100%, News about the release of the "PDF 2023" (PDF)


In [36]:
# Short sample text used to try out the summarization function; the cell shows its length in characters.
Text = '''- Awesome..  But what about the least possible capaci-induc-tances?
  Are there any limits on them towards the Plank length limit?)'''
len(Text)

131

In [37]:
# Summarize the sample text and display the result.
summarized_txt = Text_Summarization(Text)
summarized_txt


Your max_length is set to 130, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


'"What about the least possible capaci-induc-tances? Are there any limits on them towards the Plank length limit?" "I. - Awesome..  But what about the most possible capacI-Induc-Tances?" "Are there any Limits on Plank Lengths?" "Is there any limit on the Planking Length limit?""Is there a cap on the length of the plank?"'

In [38]:
# Length of the summary in characters, for comparison with the length of the input text.
len(summarized_txt)


321

In [32]:
# Read the cleaned data back from the Excel file saved earlier in the notebook.
df = pd.read_excel(r'../data/cleaned data/cleaned_data.xlsx')

In [25]:
# Take a sample of 100 rows.
# A fixed random_state makes the sample identical on every run of the notebook.
sample_df = df.sample(n=100, random_state=42)

In [26]:
# Preview the first 3 rows of the sample.
sample_df.head(3)

Unnamed: 0,index,Supplier_Name,News_Title,News_Link,News_Date,Description,Initial_Classification,Machine_Learning,Confidence,Trusted_Language
10356,43455,wts electronic components GmbH,PCIM Europa Nürnberg 2023,https://www.wts-electronic.de/news/pcim_europa...,Veröffentlicht am 23.03.2023,,No Classification Found,can't predict | insufficient description,100.0,
216131,500737,eeNews Europe,Digital twin boosts protection of vulnerable r...,https://www.eenewseurope.com/en/digital-twin-b...,24-Oct-23,,No Classification Found,no_events,82.65,
114078,282758,The Hungarian National Association of Radio Di...,Australia - Fire,https://rsoe-edis.org/eventList/details/511761/0,10/29/2023 5:10,,Fires,can't predict | insufficient description,100.0,


In [33]:
# Apply the language detection function to the 'News_Title' column (through swifter).
# Each call returns a (language, accuracy) pair; zip(*...) splits the pairs into the two new columns.
df['Detected_Language'], df['Accuracy'] = zip(*df['News_Title'].swifter.apply(Detect_language_Accuracy))

Pandas Apply:   0%|          | 0/36 [00:00<?, ?it/s]

In [34]:
# Apply the translation function to every news title, using pandarallel's parallel_apply.
df["English Translation"] = df["News_Title"].parallel_apply(translate_text)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [39]:
# Apply the summarization function to the 'Description' column, using pandarallel's parallel_apply.
df["Summarized text"] = df["Description"].parallel_apply(Text_Summarization)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [None]:
# Preview the first 3 rows of the sample.
# NOTE(review): the cells above add the detection/translation columns to `df`, not to `sample_df`,
# so the columns shown in the saved output presumably come from an earlier run - confirm on a fresh kernel.
sample_df.head(3)

Unnamed: 0,index,Supplier_Name,News_Title,News_Link,News_Date,Description,Initial_Classification,Machine_Learning,Confidence,Trusted_Language,Detected_Language,Accuracy,English Translation
140283,357999,The Hungarian National Association of Radio Di...,Congo DRC - Fire,https://rsoe-edis.org/eventList/details/413710/0,8/5/2023 12:49,,Fires,can't predict | insufficient description,100.0,,es,28.33%,Congo DRC - Fire
260789,658528,Vilros,How To Overclock Raspberry Pi 4,https://vilros.com/blogs/news/how-to-overclock...,13-Sep-21,,No Classification Found,can't predict | insufficient description,100.0,,en,45.45%,How to Overclock Raspberry Pi 4
165042,415178,Aerco,Rationalisation of TE products affect Raychem ...,https://www.aerco.co.uk/rationalisation-of-te-...,,,No Classification Found,no_events,93.46,,en,62.24%,rationalisation of TE products affect Raychem ...


In [37]:
# Save the processed DataFrame (without the index).
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
df.to_excel('../data/output/fulldata.xlsx', index=False)

In [30]:
# Preview the first 3 rows of the processed dataframe.
df.head(3)

Unnamed: 0,index,Supplier_Name,News_Title,News_Link,News_Date,Description,Initial_Classification,Machine_Learning,Confidence,Trusted_Language,Detected_Language,Accuracy,English Translation
0,30620,Business Wire,REPLY: Storm Reply lanceert op RAG gebaseerde ...,https://www.businesswire.com/news/home/2023122...,12/21/2023 - 12:36 PM,,No Classification Found,can't predict | other languages,100,nl,nl,88.86%,REPLY: Storm Reply launchs a RAG based AI chat...
1,30621,Business Wire,Samenvatting: Exscientia kondigt uitbreiding a...,https://www.businesswire.com/news/home/2023122...,12/21/2023 - 12:27 PM,,No Classification Found,can't predict | other languages,100,nl,nl,93.43%,Summary: Excientia announces expansion of its ...
2,30622,Business Wire,Samenvatting: Biocytogen en Hansoh Pharma kond...,https://www.businesswire.com/news/home/2023010...,01/03/2023 - 04:21 PM,,No Classification Found,can't predict | other languages,100,nl,nl,99.45%,Summary: Biocytogens and Hansoh Pharma announc...


In [77]:
import asyncio
import os
import shutil
import warnings
from pathlib import PurePosixPath
from typing import List

import fasttext
import numpy as np
import pandas as pd
import swifter
import uvicorn
from fastapi import FastAPI, UploadFile, File
from pydantic import BaseModel
from transformers import MarianMTModel, MarianTokenizer

warnings.filterwarnings('ignore')

class LanguageProcessor:
    """Language detection and translation over the ``News_Title`` column of a spreadsheet.

    On construction the Excel file at ``data_path`` is read into ``self.df`` and
    three models are loaded: the fastText model at ``model_loc`` and the Marian
    tokenizer/model named by ``translation_model_name``.
    """

    def __init__(self, data_path, model_loc, translation_model_name):
        """Store the configuration, then load the data and the models.

        Args:
            data_path: Path of the Excel file to read.
            model_loc: Path of the fastText model file.
            translation_model_name: Name passed to ``from_pretrained`` for the
                Marian tokenizer and model.
        """
        self.data_path = data_path
        self.model_loc = model_loc
        self.translation_model_name = translation_model_name
        self.df = None
        self.language_detection_model = None
        self.translation_model = None
        self.tokenizer = None
        self._load_data()
        self._load_models()

    def _load_data(self):
        """(Re)read the Excel file at ``self.data_path`` into ``self.df``."""
        self.df = pd.read_excel(self.data_path)

    def _load_models(self):
        """Load the language detection model and the translation model/tokenizer."""
        # Load the language detection model
        self.language_detection_model = fasttext.load_model(self.model_loc)
        # Load the translation model and tokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(self.translation_model_name)
        self.translation_model = MarianMTModel.from_pretrained(self.translation_model_name)

    def detect_language_and_accuracy(self, text):
        """Detect the language of ``text`` and report the model's confidence.

        Line breaks in ``text`` are replaced by spaces before prediction,
        because fastText's ``predict`` handles one line at a time and raises
        ``ValueError`` when the input contains ``"\\n"``.

        Returns:
            A ``(language, accuracy)`` tuple: ``language`` is the third
            ``"__"``-separated piece of the top label (``"en"`` for
            ``"__label__en"``), ``accuracy`` is the top probability as a
            percentage string with two decimals, or without decimals once it
            reaches 100.
        """
        # fastText rejects multi-line input, so flatten the text to a single line first.
        single_line_text = text.replace("\n", " ")
        # Prediction
        model_predict = self.language_detection_model.predict(single_line_text)
        # Predicted language
        language = model_predict[0][0].split("__")[2]
        # Predicted Accuracy
        accuracy_value = model_predict[1][0] * 100
        # Format Accuracy: values at or above 100 are shown without decimals.
        if accuracy_value >= 100:
            accuracy = f"{accuracy_value:.0f}%"
        else:
            accuracy = f"{accuracy_value:.2f}%"
        # Return language and accuracy
        return language, accuracy

    def translate_text(self, text):
        """Translate ``text`` with the loaded Marian model and return the decoded string."""
        tokens = self.tokenizer(text, return_tensors="pt", truncation=True, padding="longest")
        translation = self.translation_model.generate(**tokens)
        # Only the first generated sequence is decoded.
        translated_text = self.tokenizer.decode(translation[0], skip_special_tokens=True)
        return translated_text

    def process_data(self):
        """Add ``Detected_Language``, ``Accuracy`` and ``English Translation`` columns to ``self.df``."""
        titles = self.df['News_Title']
        # Apply the language detection function to the 'News_Title' column using swifter.
        # The (language, accuracy) pairs are split with comprehensions rather than
        # zip(*...), whose two-target unpacking fails when the frame has no rows.
        detections = list(titles.swifter.apply(self.detect_language_and_accuracy))
        self.df['Detected_Language'] = [language for language, _ in detections]
        self.df['Accuracy'] = [accuracy for _, accuracy in detections]
        # Apply the translation function to the 'News_Title' column using swifter
        self.df["English Translation"] = titles.swifter.apply(self.translate_text)

    def get_dataframe(self):
        """Return the current dataframe (the same object, not a copy)."""
        return self.df

# Initialize FastAPI app
app = FastAPI()

# Initialize LanguageProcessor instance.
# Both paths can be overridden through environment variables so the service is not tied
# to one machine; when the variables are unset the original values are used.
data_path = os.environ.get("CLEANED_DATA_PATH", "cleaned_data.xlsx")
model_loc = os.environ.get(
    "FASTTEXT_MODEL_PATH",
    r"C:\Users\145568\Downloads\Mywork\Analysis\Projects\Machine & Deep Learning\Event rejion translation and sumarrization\lid.176.bin",
)
translation_model_name = "Helsinki-NLP/opus-mt-mul-en"
# Constructing the processor reads the spreadsheet and loads all models immediately.
processor = LanguageProcessor(data_path, model_loc, translation_model_name)

# Define Pydantic models

# Request body shared by the detection and translation routes: a single text field.
class TextItem(BaseModel):
    text: str

# Response of /detect_language: the language code and the confidence formatted as a percentage string.
class DetectionResponse(BaseModel):
    language: str
    accuracy: str

# Response of /translate: the translated text.
class TranslationResponse(BaseModel):
    translation: str

# Define routes
@app.post("/detect_language", response_model=DetectionResponse)
async def detect_language(item: TextItem):
    """Detect the language of ``item.text`` and return it with the formatted confidence."""
    # Inside a coroutine the running loop is the one to use; get_running_loop()
    # is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    # The model call is synchronous, so it runs in the default executor to keep the event loop free.
    language, accuracy = await loop.run_in_executor(None, processor.detect_language_and_accuracy, item.text)
    return DetectionResponse(language=language, accuracy=accuracy)

# NOTE(review): this coroutine has the same name as the notebook-level translate_text helper,
# so running this cell rebinds that name in the notebook namespace.
@app.post("/translate", response_model=TranslationResponse)
async def translate_text(item: TextItem):
    """Translate ``item.text`` with the shared processor and return the translation."""
    # Inside a coroutine the running loop is the one to use; get_running_loop()
    # is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    # The model call is synchronous, so it runs in the default executor to keep the event loop free.
    translation = await loop.run_in_executor(None, processor.translate_text, item.text)
    return TranslationResponse(translation=translation)

@app.post("/upload_data/")
async def upload_data(file: UploadFile = File(...)):
    """Save an uploaded spreadsheet, load it into ``processor`` and re-run the processing.

    The file is written to the working directory as ``uploaded_<name>``, where
    ``<name>`` is the last path component of the client-supplied filename.

    Returns:
        A dict with an ``info`` message and the filename as sent by the client.
    """
    # The filename is chosen by the client. Keep only its final path component
    # (treating both "/" and "\\" as separators) so the write cannot leave the
    # working directory; fall back to a fixed name when nothing is left.
    client_name = file.filename or ""
    safe_name = PurePosixPath(client_name.replace("\\", "/")).name or "upload"

    # Save the uploaded file, copying in chunks instead of reading it into memory at once.
    file_location = f"uploaded_{safe_name}"
    with open(file_location, "wb") as file_object:
        shutil.copyfileobj(file.file, file_object)

    # Update the processor with the new file.
    # NOTE(review): these calls are synchronous and run on the event loop, so other
    # requests wait until the whole file has been processed.
    processor.data_path = file_location
    processor._load_data()
    processor.process_data()

    return {"info": "file uploaded successfully", "filename": file.filename}

@app.get("/get_processed_data")
async def get_processed_data():
    """Return the processor's dataframe as a list of row dicts, with missing values as null."""
    df = processor.get_dataframe()
    # NaN is not valid JSON and makes the response fail to serialize. Casting to
    # object lets missing cells hold None, which is emitted as null.
    json_safe = df.astype(object).where(df.notna(), None)
    return json_safe.to_dict(orient='records')

# Run the app with Uvicorn, listening on all interfaces (0.0.0.0) on port 8000.
# NOTE(review): uvicorn is already imported at the top of this cell, so the import below is redundant.
# NOTE(review): this cell lives in a notebook, where an event loop is typically already running -
# confirm that uvicorn.run works there, or run this code as a standalone script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
