# 2 Large Dataset Preprocess

In [102]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, words
from nltk.stem.porter import PorterStemmer
import string

stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
directory = 'LargeDataset/full_docs/full_docs/'

document_names = []
document_content = []

for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        
        # Read the content of each file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            document_names.append(filename)
            document_content.append(content)

df = pd.DataFrame({
    'DocumentName': document_names,
    'Content': document_content
})


In [103]:
df.head()

Unnamed: 0,DocumentName,Content
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’..."
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS..."


In [104]:
df.shape

(501676, 2)

### Split United Words

In [107]:
def split_words(text):
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

df['PreprocessedContent'] = df['Content'].apply(split_words)

In [108]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,Science & Mathematics Physics The hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""Health Other - Health I have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""Supertramp The Logical Song without Cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""Legal market overview‘ France is shrinking! ’..."
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...","""Health_Concerns: WHAT ARE THE MEDICAL DANGERS..."


In [109]:
df.loc[0,"Content"]

'Science & Mathematics PhysicsThe hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces.  Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... show moreFollow 3 answersAnswersRelevanceRatingNewestOldestBest Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 mSource (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 CommentSchmiso, you forgot a 4 in your answer. Your link even says it: L = 

In [110]:
df.loc[0,"PreprocessedContent"]

'Science & Mathematics Physics The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces.  Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... show more Follow 3 answers Answers Relevance Rating Newest Oldest Best Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 m Source (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 Comment Schmiso, you forgot a 4 in your answer. Your link even say

In [111]:
df.loc[1,"Content"]

'"Health Other - HealthI have trouble swallowing due to MS, can I crush valium & other meds to be easier to swallowll?Follow 5 answersAnswersRelevanceRatingNewestOldestBest Answer: If you have a problem swallowing, try crushing Valium (or other tablets) between two spoons, and taking them in a teaspoon of your favorite Jelly (raspberry???). The jelly helps the crushed meds slide down ~Anonymous · 10 years ago0 2 CommentAsker\'s ratingAsk your pharmacist if any or all of your meds can be made into syrup form if you have trouble swallowing. Many forms of medication are designed to be swallowed whole and not interferred with. Do not take advice from those people on here who are only guessing at a correct answer. Seek the advice of professionals.Lady spanner · 10 years ago0 0 CommentI\'m pretty sure its not a good idea to crush pills. You should definitely ask your doctor before doing anything like that, it might be dangerous.littleWing · 10 years ago0 0 CommentPlease ask your doctor! This

In [112]:
df.loc[1,"PreprocessedContent"]

'"Health Other - Health I have trouble swallowing due to MS, can I crush valium & other meds to be easier to swallowll?Follow 5 answers Answers Relevance Rating Newest Oldest Best Answer: If you have a problem swallowing, try crushing Valium (or other tablets) between two spoons, and taking them in a teaspoon of your favorite Jelly (raspberry???). The jelly helps the crushed meds slide down ~Anonymous · 10 years ago0 2 Comment Asker\'s rating Ask your pharmacist if any or all of your meds can be made into syrup form if you have trouble swallowing. Many forms of medication are designed to be swallowed whole and not interferred with. Do not take advice from those people on here who are only guessing at a correct answer. Seek the advice of professionals.Lady spanner · 10 years ago0 0 Comment I\'m pretty sure its not a good idea to crush pills. You should definitely ask your doctor before doing anything like that, it might be dangerous.little Wing · 10 years ago0 0 Comment Please ask your 

### Lowercase

In [114]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [115]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science & mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""health other - health i have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""supertramp the logical song without cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""legal market overview‘ france is shrinking! ’..."
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...","""health_concerns: what are the medical dangers..."


 ### Removing URLs

In [117]:
url_pattern = re.compile(r'https?://\S+')

def remove_urls(text):
    return url_pattern.sub('', text)

df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_urls)

In [118]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science & mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""health other - health i have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""supertramp the logical song without cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""legal market overview‘ france is shrinking! ’..."
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...","""health_concerns: what are the medical dangers..."


### Removing remove non-word and non-whitespace characters

In [120]:
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'[^\w\s]', value=' ', regex=True)

In [121]:
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'[^a-zA-Z\s]', value=' ', regex=True)

In [122]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health other health i have trouble swallowi...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp the logical song without cofounder...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france is shrinking ...
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...",health concerns what are the medical dangers...


In [123]:
df.loc[0,"PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

### Remove \n

In [125]:
df_with_newline = df[df['PreprocessedContent'].apply(lambda x: '\n' in x if isinstance(x, str) else False)]

In [126]:
df_with_newline.iloc[0]["PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

In [127]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: x.replace('\n', '') if isinstance(x, str) else x)

In [128]:
df.iloc[0]["PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

### Removing digits

In [130]:
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'\d', value=' ', regex=True)

In [131]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health other health i have trouble swallowi...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp the logical song without cofounder...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france is shrinking ...
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...",health concerns what are the medical dangers...


### Remove Stopwords

In [135]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [136]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_stopwords)

In [137]:
df['PreprocessedContent'].head()

0    science mathematics physics hot glowing surfac...
1    health health trouble swallowing due ms crush ...
2    supertramp logical song without cofounder roge...
3    legal market overview france shrinking laments...
4    health concerns medical dangers marijuana use ...
Name: PreprocessedContent, dtype: object

### Remove Punctuations

In [140]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: ''.join([' ' if char in string.punctuation else char for char in x]))

In [141]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics hot glowing surfac...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health health trouble swallowing due ms crush ...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp logical song without cofounder roge...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france shrinking laments...
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...",health concerns medical dangers marijuana use ...


In [142]:
df.loc[22]

DocumentName                                           output_100016.txt
Content                "Examples of what you can createUse one of the...
PreprocessedContent    examples create use one prebuilt themes speed ...
Name: 22, dtype: object

### Remove Non-English Words

In [145]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\halil\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [146]:
english_words = set(words.words())

In [147]:
def remove_non_english_words(text):
    return ' '.join([word for word in text.split() if word.lower() in english_words])

In [148]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_non_english_words)

### Remove One Letters

In [151]:
def remove_one_letter_words(text):
    return ' '.join([word for word in text.split() if len(word) > 1])

In [152]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_one_letter_words)

In [153]:
df['PreprocessedContent'].head()

0    science mathematics physics hot glowing surfac...
1    health health trouble swallowing due ms crush ...
2    supertramp logical song without cofounder roge...
3    legal market overview france shrinking laments...
4    health concerns medical dangers marijuana use ...
Name: PreprocessedContent, dtype: object

### Remove Multiple Space

In [155]:
df['PreprocessedContent'] = df['PreprocessedContent'].str.replace(r'\s+', ' ', regex=True)

In [156]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics hot glowing surfac...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health health trouble swallowing due ms crush ...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp logical song without cofounder roge...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france shrinking laments...
4,output_10000.txt,"""Health_Concerns: WHAT ARE THE MEDICAL DANGERS...",health concerns medical dangers marijuana use ...


### Remove Empty Rows

In [158]:
df['ContentLength'] = df['PreprocessedContent'].apply(lambda x: len(x) if isinstance(x, str) else 0)

In [159]:
df['ContentLength'].describe()

count    501676.000000
mean       4828.293335
std       10006.506602
min           0.000000
25%        1257.000000
50%        2472.000000
75%        4715.000000
max      909652.000000
Name: ContentLength, dtype: float64

In [160]:
results = pd.read_csv("SmallDataset/dev_query_results_small.csv")

In [161]:
results['doc_number'] = 'output_' + results['doc_number'].astype(str) + '.txt'

In [162]:
results.head()

Unnamed: 0,Query_number,doc_number
0,1089273,output_590.txt
1,1087869,output_1478.txt
2,1087858,output_547.txt
3,1087532,output_595.txt
4,1086886,output_466.txt


In [163]:
filtered_df = df[df['DocumentName'].isin(results['doc_number'])]

In [164]:
filtered_df["ContentLength"].describe()

count      210.000000
mean      8374.171429
std      11399.216597
min        288.000000
25%       2721.000000
50%       4900.000000
75%       9571.000000
max      84768.000000
Name: ContentLength, dtype: float64

- Content length of the question - answer statistics are above.

In [166]:
df[df['ContentLength']<10]

Unnamed: 0,DocumentName,Content,PreprocessedContent,ContentLength
6,output_100001.txt,"""""\n",,0
126,output_10011.txt,"""""\n",,0
231,output_100204.txt,"""""\n",,0
1388,output_101246.txt,"""""\n",,0
1411,output_101267.txt,Loading...\n,loading,7
...,...,...,...,...
500462,output_98906.txt,"""""\n",,0
501151,output_99526.txt,"""""\n",,0
501602,output_99932.txt,"""""\n",,0
501648,output_99974.txt,Settings\n,settings,8


In [167]:
df.drop(df[df['ContentLength']<10].index,inplace=True)
df = df.reset_index(drop=True)

In [168]:
df["PreprocessedContent"].isna().sum()

0

### Remove Space from Start-End of Content 

In [170]:
df['PreprocessedContent'] = df['PreprocessedContent'].str.strip()

In [171]:
df.sample()["PreprocessedContent"].values[0]

'media tour operators add update listing contact us international official site kentucky department tourism airport rd nd floor frankfort ky site map privacy disclaimer accessibility kentucky department travel rights reserved powered aristotle'

In [172]:
df["PreprocessedContent"].isna().sum()

0

### Stemming

In [174]:
def stem_words(text):
    word_tokens = text.split()
    stems = ' '.join([stemmer.stem(word) for word in word_tokens])
    return stems

In [175]:
df["PreprocessedContent"].head().apply(stem_words)

0    scienc mathemat physic hot glow surfac star em...
1    health health troubl swallow due ms crush vali...
2    supertramp logic song without cofound roger ho...
3    legal market overview franc shrink lament one ...
4    health concern medic danger marijuana use must...
Name: PreprocessedContent, dtype: object

In [176]:
df["PreprocessedContent"] = df["PreprocessedContent"].apply(stem_words)

In [237]:
df.to_csv("LargeDataset/PreprocessedContents.csv",index=False)

# For Later...
- Spelling auto correction could be applied to increase performance

In [4]:
from autocorrect import Speller

spell = Speller(lang='en')

print(spell('caaaar'))
print(spell('mussage'))
print(spell('survice'))
print(spell('hte'))

aaaaaa
message
service
the
