In [2]:
import re
import unicodedata

import contractions
import nltk
import pandas as pd
import spacy
from nltk.tokenize import ToktokTokenizer

In [2]:
def preprocess(series):
    """Normalise a Series of English article bodies.

    Steps, in order:
      1. stringify every value and lowercase it,
      2. expand contractions with ``contractions.fix``,
      3. delete every character that is not an ASCII letter or whitespace,
      4. lemmatise with spaCy's ``en_core_web_sm`` pipeline,
      5. drop NLTK English stopwords, keeping ``'not'``.

    Parameters
    ----------
    series : pandas.Series
        Raw texts. Values are passed through ``str()``, so a missing value
        becomes the literal text ``"nan"``; filter such rows out beforehand.

    Returns
    -------
    pandas.Series
        Cleaned texts, aligned with the index of ``series``.
    """
    series = series.apply(lambda x: str(x).lower())
    series = series.apply(contractions.fix)

    # One pass replaces the former three (strip non-word chars, strip
    # non-alphanumerics, strip digits). Their combined effect was exactly
    # "keep ASCII letters and whitespace"; the old third pattern also carried
    # a typo'd range (A-z instead of A-Z).
    series = series.str.replace(r'[^a-zA-Z\s]', '', regex=True)

    # Loading the spaCy model is expensive, so keep it on the function object
    # and reuse it across calls instead of reloading for every dataset.
    nlp = getattr(preprocess, '_nlp', None)
    if nlp is None:
        nlp = preprocess._nlp = spacy.load('en_core_web_sm')

    def get_lem(text):
        # spaCy 2.x emits the placeholder lemma '-PRON-' for pronouns; fall
        # back to the surface form in that case.
        return ' '.join(
            word.lemma_ if word.lemma_ != '-PRON-' else word.text
            for word in nlp(text)
        )
    series = series.apply(get_lem)

    tokenizer = ToktokTokenizer()
    # A set gives constant-time membership tests; 'not' is kept in the text
    # because it is excluded from the stopwords here.
    stopword_set = set(nltk.corpus.stopwords.words('english')) - {'not'}

    def remove_stopwords(text):
        tokens = (token.strip() for token in tokenizer.tokenize(text))
        return ' '.join(
            token for token in tokens if token.lower() not in stopword_set
        )
    series = series.apply(remove_stopwords)
    return series

### IndiaToday 

In [3]:
# Load the raw IndiaToday articles; later cells use the Heading and Body columns.
df = pd.read_excel("IndiaToday.xlsx")

In [4]:
def remove_edited(row):
    """Cut an article body off at its first "Edited By: " marker.

    Parameters
    ----------
    row : object
        One cell of the Body column.

    Returns
    -------
    object
        For a string: everything before the first ``"Edited By: "``, or the
        whole string when the marker is absent. Any non-string value (for
        example a float NaN) is returned unchanged, because this
        function runs before the non-text rows are filtered out.
    """
    if not isinstance(row, str):
        return row
    # partition() yields the full string as its first item when the marker
    # is not found, which matches the previous find()/slice behaviour.
    return row.partition("Edited By: ")[0]
# Trim the trailing "Edited By: ..." credit from every IndiaToday body.
df.Body = df.Body.apply(remove_edited)

In [5]:
# Keep only the rows whose Body is not a plain number (float NaN counts as one).
df = df.loc[[not isinstance(body, (float, int)) for body in df['Body']]]

In [6]:
# Drop horoscope articles (case-insensitive match on the heading).
# na=False treats a missing Heading as "no match" instead of letting a NaN
# reach the `~` / boolean mask, which raises.
df = df[~df['Heading'].str.contains('horoscope', case=False, na=False)]

In [7]:
# Run the English cleaning pipeline (preprocess) over the IndiaToday bodies.
df.Body = preprocess(df.Body)

In [8]:
# Drop every row that still has a missing value in any column.
df = df.dropna()

In [9]:
# file_name = "IndiaToday_Preprocessed.xlsx"
# df.to_excel(file_name, index=False)

### IndiaTV 

In [10]:
# Load the raw IndiaTV articles; later cells use the Heading and Body columns.
df2 = pd.read_excel("IndiaTv.xlsx")

In [11]:
# Keep only the rows whose Body is not a plain number (float NaN counts as one).
df2 = df2.loc[[not isinstance(body, (float, int)) for body in df2['Body']]]

In [12]:
# Drop recurring show/horoscope items by heading. One alternation pattern
# replaces three separate scans; na=False makes a missing Heading count as
# "no match" instead of propagating NaN into the boolean mask.
df2 = df2.loc[~df2.Heading.str.contains("Aaj Ki Baat|Horoscope|Aap Ki Adalat", na=False)]

In [13]:
# Drop horoscope articles (case-insensitive match on the heading).
# na=False treats a missing Heading as "no match" instead of letting a NaN
# reach the `~` / boolean mask, which raises.
df2 = df2[~df2['Heading'].str.contains('horoscope', case=False, na=False)]

In [14]:
# Run the English cleaning pipeline (preprocess) over the IndiaTV bodies.
df2.Body = preprocess(df2.Body)

In [15]:
# Drop every row that still has a missing value in any column.
df2 = df2.dropna()

In [16]:
# file_name = "IndiaTV_Preprocessed.xlsx"
# df2.to_excel(file_name, index=False)

### News18 

In [17]:
# Load the raw News18 articles; later cells use the Heading and Body columns.
df3 = pd.read_excel("News18.xlsx")

In [18]:
# Keep only the rows whose Body is not a plain number (float NaN counts as one).
df3 = df3.loc[[not isinstance(body, (float, int)) for body in df3['Body']]]

In [19]:
# Drop horoscope articles (case-insensitive match on the heading).
# na=False treats a missing Heading as "no match" instead of letting a NaN
# reach the `~` / boolean mask, which raises.
df3 = df3[~df3['Heading'].str.contains('horoscope', case=False, na=False)]

In [20]:
# Run the English cleaning pipeline (preprocess) over the News18 bodies.
df3.Body = preprocess(df3.Body)

In [21]:
# Drop every row that still has a missing value in any column.
df3 = df3.dropna()

In [22]:
# file_name = "News18_Preprocessed.xlsx"
# df3.to_excel(file_name, index=False)

### ThePrint 

In [23]:
# Load the raw ThePrint articles; later cells use the Heading and Body columns.
df4 = pd.read_excel("ThePrint.xlsx")

In [24]:
# Keep only the rows whose Body is not a plain number (float NaN counts as one).
df4 = df4.loc[[not isinstance(body, (float, int)) for body in df4['Body']]]

In [25]:
# Drop bodies containing "dear subscriber" (case-insensitive).
# na=False makes a non-text Body count as "no match" rather than
# putting NaN into the boolean mask.
df4 = df4[~df4['Body'].str.contains('dear subscriber', case=False, na=False)]

In [26]:
# Drop horoscope articles (case-insensitive match on the heading).
# na=False treats a missing Heading as "no match" instead of letting a NaN
# reach the `~` / boolean mask, which raises.
df4 = df4[~df4['Heading'].str.contains('horoscope', case=False, na=False)]

In [27]:
# Run the English cleaning pipeline (preprocess) over the ThePrint bodies.
df4.Body = preprocess(df4.Body)

In [28]:
# Drop every row that still has a missing value in any column.
df4 = df4.dropna()

In [29]:
# file_name = "ThePrint_Preprocessed.xlsx"
# df4.to_excel(file_name, index=False)

### Merge Datasets

In [30]:
# Stack the four cleaned English sources row-wise into one frame with a fresh index.
df5 = pd.concat([df, df2, df3, df4], ignore_index=True, axis=0)

In [42]:
# Remove rows without a Heading. Calling dropna(inplace=True) on the column
# only shortens that extracted Series and leaves df5 itself untouched, so
# the filter has to be applied to the frame.
df5 = df5.dropna(subset=["Heading"])

In [45]:
# Sanity check: count of missing Body values in the merged frame.
df5.Body.isna().sum()

0

In [48]:
import numpy as np

In [53]:
# Discard articles whose cleaned Body ended up as an empty string.
df5 = df5.loc[df5.Body != ""]

In [54]:
# Write the merged, cleaned English corpus to disk without the index column.
file_name = "Final_Prepped_Data.xlsx"
df5.to_excel(file_name, index=False)

### News18_Punjab

In [44]:
# Load the raw News18 Punjab articles; later cells use the Body column.
df6 = pd.read_excel("News18_Punjab.xlsx")

In [47]:
# Eyeball one raw sample: the cell at row position 10, column position 1.
df6.iloc[10, 1]

"Apple ਦੀ ਨਵੀਂ ਆਈਫੋਨ 15 ਸੀਰੀਜ਼ 'ਚ ਨੇ ਐਂਟਰੀ ਕਰ ਲਈ ਹੈ ਅਤੇ ਇਸ ਸੀਰੀਜ਼ 'ਚ ਕੰਪਨੀ ਨੇ ਚਾਰ ਨਵੇਂ ਮਾਡਲ iPhone 15, iPhone 15 Plus, iPhone 15 Pro ਅਤੇ iPhone 15 Pro Max ਨੂੰ ਸ਼ਾਮਲ ਕੀਤਾ ਹੈ। ਨਵੇਂ ਫੋਨ ਦੀ ਪ੍ਰੀ-ਬੁਕਿੰਗ 15 ਸਤੰਬਰ ਤੋਂ ਸ਼ੁਰੂ ਹੋ ਗਈ ਹੈ ਅਤੇ ਡਿਲੀਵਰੀ 22 ਸਤੰਬਰ ਤੋਂ ਸ਼ੁਰੂ ਹੋਵੇਗੀ। ਨਵੇਂ ਆਈਫੋਨ ਦੇ ਆਉਣ ਤੋਂ ਬਾਅਦ, ਪੁਰਾਣੇ ਆਈਫੋਨ ਦੀ ਕੀਮਤ ਘੱਟ ਹੋਣ 'ਤੇ ਕੁਝ ਲੋਕ ਉਸ ਮਾਡਲ ਨੂੰ ਖਰੀਦਣ ਦਾ ਪਲਾਨ ਬਣਾਉਂਦੇ ਹਨ। ਐਪਲ ਨੇ ਆਪਣੇ ਆਈਫੋਨ 14, 14 ਪ੍ਰੋ, ਆਈਫੋਨ 13 ਦੀ ਕੀਮਤ ਵਿੱਚ ਕਟੌਤੀ ਕਰ ਦਿੱਤੀ ਹੈਅਜਿਹੇ 'ਚ ਸਵਾਲ ਇਹ ਬਣਦਾ ਹੈ ਕਿ ਕੀ ਆਈਫੋਨ 15 ਦੇ ਆਉਣ ਤੋਂ ਬਾਅਦ ਆਈਫੋਨ 13 ਖਰੀਦਣਾ ਇੱਕ ਫਾਇਦੇ ਦਾ ਸੌਦਾ ਰਹੇਗਾ ਜਾਂ 2 ਸਾਲ ਪੁਰਾਣੇ ਮਾਡਲ ਨੂੰ ਖਰੀਦਣਾ ਮੂਰਖਤਾ ਹੈ। ਇਸ ਨੂੰ ਸਮਝਣ ਲਈ ਜ਼ਰੂਰੀ ਹੈ ਕਿ ਇਨ੍ਹਾਂ ਦੋਵਾਂ ਫੋਨਾਂ ਦੇ ਫੀਚਰਸ 'ਚ ਫਰਕ ਦੇਖ ਲਿਆ ਜਾਵੇ।ਸਭ ਤੋਂ ਪਹਿਲਾਂ, ਕੀਮਤ ਦੀ ਗੱਲ ਕਰੀਏ ਤਾਂ ਆਈਫੋਨ 15 ਦੀ ਸ਼ੁਰੂਆਤੀ ਕੀਮਤ 79,900 ਰੁਪਏ ਹੈ। ਜਦੋਂ ਕਿ iPhone 13 ਨੂੰ 59,900 ਰੁਪਏ ਦੀ ਸ਼ੁਰੂਆਤੀ ਕੀਮਤ 'ਤੇ ਘਰ ਲਿਆਂਦਾ ਜਾ ਸਕਦਾ ਹੈ।Display: ਆਈਫੋਨ 15 ਵਿੱਚ 6.1 ਇੰਚ ਦੀ ਸੁਪਰ ਰੈਟੀਨਾ XDR ਡਿਸਪਲੇ ਦਿੱਤੀ ਗਈ ਹੈ। ਆਈਫੋਨ 13 ਵਿੱਚ ਵੀ 6.1 ਇੰਚ ਦੀ ਸੁਪਰ ਰੇਟੀਨਾ ਐਕਸਡੀਆਰ ਡਿਸਪਲੇਅ ਹੈ। iPhone 15 ਵਿੱਚ ਐਲੂਮੀਨੀਅਮ ਦੇ ਨਾਲ ਕਲਰ ਇਨਫਿਊਜ਼ਡ ਗਲਾਸ ਬੈਕ ਮਿਲਦਾ

In [4]:
def preprocess_punjabi(series):
    """Strip punctuation and digits from Punjabi article bodies.

    For every string value, keeps letters, combining marks, non-decimal
    numeric characters, underscores and whitespace; deletes everything else,
    including decimal digits of any script (ASCII, Gurmukhi, Devanagari, ...).

    Combining marks are kept explicitly: Python's ``\\w`` does not match the
    dependent vowel signs of Indic scripts, so the former ``[^\\w\\s]``
    substitution deleted them and mangled every word.

    Parameters
    ----------
    series : pandas.Series
        Raw texts. Non-string values (e.g. NaN) are passed through unchanged
        so that a later ``dropna`` can still remove them.

    Returns
    -------
    pandas.Series
        Cleaned texts, aligned with the index of ``series``.
    """
    def is_kept(ch):
        # Decimal digits are removed outright; this makes the former
        # digit-by-digit transliteration loop (which only ever applied its
        # last substitution) unnecessary.
        if ch.isdecimal():
            return False
        if ch.isalnum() or ch == '_' or ch.isspace():
            return True
        # Unicode categories Mn/Mc/Me: vowel signs, virama, nukta, etc.
        return unicodedata.category(ch).startswith('M')

    def clean(text):
        if not isinstance(text, str):
            return text
        return ''.join(ch for ch in text if is_kept(ch))

    return series.apply(clean)

In [5]:
# Run the Punjabi cleaning step (preprocess_punjabi) over the article bodies.
df6.Body = preprocess_punjabi(df6.Body)

In [42]:
# Drop every row that has a missing value in any column.
df6 = df6.dropna()

In [43]:
# Discard articles whose cleaned Body is an empty string. The mask must
# index df6; wrapping it in a bare list replaced the frame with a one-element
# list holding the boolean Series.
df6 = df6[~(df6.Body == "")]

### AajTak_Hindi

In [10]:
# Load the raw AajTak articles; later cells use the Body column.
df7 = pd.read_excel("AajTak.xlsx")

In [37]:
def preprocess_hindi(series):
    """Clean Hindi article bodies: strip punctuation, stopwords and digits.

    Each string value is processed as follows:
      1. characters that are not letters, combining marks, numeric
         characters, underscores or whitespace are deleted;
      2. the text is split on whitespace (newlines and non-breaking spaces
         included) and every token found in the Hindi/English stopword lists
         or the leftover-punctuation list is dropped;
      3. the surviving tokens are joined with single spaces;
      4. decimal digits of any script are deleted.

    Combining marks are kept explicitly: Python's ``\\w`` does not match the
    dependent vowel signs of Devanagari, so the former ``[^\\w\\s]``
    substitution deleted them, which also meant that stopwords containing
    such signs could never match. Newlines and non-breaking spaces act as
    token separators here; deleting them fused the neighbouring words.

    Parameters
    ----------
    series : pandas.Series
        Raw texts. Non-string values (e.g. NaN) are passed through unchanged.

    Returns
    -------
    pandas.Series
        Cleaned texts, aligned with the index of ``series``.
    """
    stopwords_hi = ['तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी','क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी','श्री','वैसा','आपका','अंदर', 'अत', 'अपना', 'अपनी', 'अपने', 'अभी', 'आदि', 'आप', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी', 'इसके', 'इसमें', 'इसी', 'इसे', 'उन', 'उनका', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'ऐसे', 'और', 'कई', 'कर','करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफ़ी', 'कि', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जा', 'जितना', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो', 'तक', 'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था', 'थी', 'थे', 'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नहीं', 'ना', 'निहायत', 'नीचे', 'ने', 'पर', 'पर', 'पहले', 'पूरा', 'पे', 'फिर', 'बनी', 'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल', 'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वर्ग', 'वह', 'वह', 'वहाँ', 'वहीं', 'वाले', 'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभी', 'साथ', 'साबुत', 'साभ', 'सारा', 'से', 'सो', 'ही', 'हुआ', 'हुई', 'हुए', 'है', 'हैं', 'हो', 'होता', 'होती', 'होते', 'होना', 'होने', 'अपनि', 'जेसे', 'होति', 'सभि', 'तिंहों', 'इंहों', 'दवारा', 'इसि', 'किंहें', 'थि', 'उंहों', 'ओर', 'जिंहें', 'वहिं', 'अभि', 'बनि', 'हि', 'उंहिं', 'उंहें', 'हें', 'वगेरह', 'एसे', 'रवासा', 'कोन', 'निचे', 'काफि', 'उसि', 'पुरा', 'भितर', 'हे', 'बहि', 'वहां', 'कोइ', 'यहां', 'जिंहों', 'तिंहें', 'किसि', 'कइ', 'यहि', 'इंहिं', 'जिधर', 'इंहें', 'अदि', 'इतयादि', 'हुइ', 'कोनसा', 'इसकि', 'दुसरे', 'जहां', 'अप', 'किंहों', 'उनकि', 'भि', 'वरग', 'हुअ', 'जेसा', 'नहिं']
    stopwords_en = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    # 'nn' / 'n' are presumably what is left of literal "\n" escape text once
    # the backslash has been stripped -- TODO confirm against the raw data.
    punctuations = ['nn', 'n', '।', '/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_', '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#', '*', ';', '-','}']
    # A set gives constant-time membership tests for every token.
    to_be_removed = set(stopwords_hi + punctuations + stopwords_en)

    def is_kept(ch):
        if ch.isalnum() or ch == '_' or ch.isspace():
            return True
        # Unicode categories Mn/Mc/Me: vowel signs, virama, anusvara, nukta...
        return unicodedata.category(ch).startswith('M')

    def clean(text):
        if not isinstance(text, str):
            return text
        text = ''.join(ch for ch in text if is_kept(ch))
        kept_words = [word for word in text.split() if word not in to_be_removed]
        text = ' '.join(kept_words)
        # Digits go last, as before; str.isdecimal covers ASCII and
        # Devanagari digits alike, so no transliteration step is needed.
        return ''.join(ch for ch in text if not ch.isdecimal())

    return series.apply(clean)

In [38]:
# Run the Hindi cleaning step (preprocess_hindi) over the AajTak bodies.
df7.Body = preprocess_hindi(df7.Body)

In [None]:
def preprocess_hin_pun(series):
    """Delete newline and non-breaking-space characters from each text.

    Parameters
    ----------
    series : pandas.Series
        Texts to clean via the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The texts with every ``"\\n"`` and ``"\\xa0"`` removed.
    """
    # NOTE(review): the characters are deleted, not replaced by a space, so
    # words separated only by one of them end up joined together.
    cleaned = series
    for unwanted in ("\n", "\xa0"):
        cleaned = cleaned.str.replace(unwanted, '')
    return cleaned
# NOTE(review): `df` is the frame loaded from IndiaToday.xlsx earlier in this
# notebook, not df6 (Punjabi) or df7 (Hindi) -- confirm which frame this
# Hindi/Punjabi helper was meant to clean.
df["Body"] = preprocess_hin_pun(df["Body"])