# 0 Preprocess

In [65]:
import os
import pandas as pd
import re
import string
import nltk
from nltk.corpus import words
from nltk.stem.porter import PorterStemmer
nltk.download('words')
english_words = set(words.words())
stemmer = PorterStemmer()
directory = 'SmallDataset/full_docs_small'

# os.listdir() returns entries in arbitrary, platform-dependent order. Later cells
# address rows by position (e.g. df.loc[0], df.loc[22]), so sort the file names to
# make the row order reproducible on every machine and every run.
document_names = sorted(
    filename for filename in os.listdir(directory) if filename.endswith(".txt")
)

# Read the full text of each document, in the same order as document_names.
document_content = []
for filename in document_names:
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        document_content.append(file.read())

# One row per document: the file name and its raw, unprocessed text.
df = pd.DataFrame({
    'DocumentName': document_names,
    'Content': document_content
})


In [66]:
df.head()

Unnamed: 0,DocumentName,Content
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’..."
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas..."


### Split Concatenated Words

In [68]:
def split_words(text):
    """Insert a space between a lowercase ASCII letter and an uppercase one that follows it.

    This separates words that were fused together in the raw text, e.g.
    ``"PhysicsThe"`` becomes ``"Physics The"``. All other characters are left untouched.
    """
    # Zero-width match on the position between [a-z] and [A-Z]; nothing is consumed,
    # a single space is simply inserted there.
    case_boundary = r'(?<=[a-z])(?=[A-Z])'
    return re.sub(case_boundary, ' ', text)

df['PreprocessedContent'] = df['Content'].apply(split_words)

In [69]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,Science & Mathematics Physics The hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""Health Other - Health I have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""Supertramp The Logical Song without Cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""Legal market overview‘ France is shrinking! ’..."
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...","""Name Argon Symbol Ar Atomic Number 18Atomic M..."


In [70]:
df.loc[0,"Content"]

'Science & Mathematics PhysicsThe hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces.  Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... show moreFollow 3 answersAnswersRelevanceRatingNewestOldestBest Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 mSource (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 CommentSchmiso, you forgot a 4 in your answer. Your link even says it: L = 

In [71]:
df.loc[0,"PreprocessedContent"]

'Science & Mathematics Physics The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?It is a good approximation to assume that the emissivity e is equal to 1 for these surfaces.  Find the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical. Use σ =... show more Follow 3 answers Answers Relevance Rating Newest Oldest Best Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) = √ ( 2.7x10+32 W / (1 · 5.67x10-8W/m²K^4 · (1100K)^4 · π) ) = 3.22x10+13 m Source (s):http://en.wikipedia.org/wiki/Stefan_bolt...schmiso · 1 decade ago0 18 Comment Schmiso, you forgot a 4 in your answer. Your link even say

In [72]:
df.loc[1,"Content"]

'"Health Other - HealthI have trouble swallowing due to MS, can I crush valium & other meds to be easier to swallowll?Follow 5 answersAnswersRelevanceRatingNewestOldestBest Answer: If you have a problem swallowing, try crushing Valium (or other tablets) between two spoons, and taking them in a teaspoon of your favorite Jelly (raspberry???). The jelly helps the crushed meds slide down ~Anonymous · 10 years ago0 2 CommentAsker\'s ratingAsk your pharmacist if any or all of your meds can be made into syrup form if you have trouble swallowing. Many forms of medication are designed to be swallowed whole and not interferred with. Do not take advice from those people on here who are only guessing at a correct answer. Seek the advice of professionals.Lady spanner · 10 years ago0 0 CommentI\'m pretty sure its not a good idea to crush pills. You should definitely ask your doctor before doing anything like that, it might be dangerous.littleWing · 10 years ago0 0 CommentPlease ask your doctor! This

In [73]:
df.loc[1,"PreprocessedContent"]

'"Health Other - Health I have trouble swallowing due to MS, can I crush valium & other meds to be easier to swallowll?Follow 5 answers Answers Relevance Rating Newest Oldest Best Answer: If you have a problem swallowing, try crushing Valium (or other tablets) between two spoons, and taking them in a teaspoon of your favorite Jelly (raspberry???). The jelly helps the crushed meds slide down ~Anonymous · 10 years ago0 2 Comment Asker\'s rating Ask your pharmacist if any or all of your meds can be made into syrup form if you have trouble swallowing. Many forms of medication are designed to be swallowed whole and not interferred with. Do not take advice from those people on here who are only guessing at a correct answer. Seek the advice of professionals.Lady spanner · 10 years ago0 0 Comment I\'m pretty sure its not a good idea to crush pills. You should definitely ask your doctor before doing anything like that, it might be dangerous.little Wing · 10 years ago0 0 Comment Please ask your 

### Lowercase

In [74]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [75]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science & mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""health other - health i have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""supertramp the logical song without cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""legal market overview‘ france is shrinking! ’..."
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...","""name argon symbol ar atomic number 18atomic m..."


### Removing URLs

In [76]:
# Matches "http://" / "https://" links and scheme-less "www." links, in any letter
# case. \S+ then consumes everything up to the next whitespace character, so any
# text glued to the end of the URL without a space is removed with it.
url_pattern = re.compile(r'(?:https?://|\bwww\.)\S+', re.IGNORECASE)

def remove_urls(text):
    """Return ``text`` with every URL deleted.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with each match of ``url_pattern`` replaced by the empty string;
        surrounding whitespace is left as it was.
    """
    return url_pattern.sub('', text)

df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_urls)

In [77]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science & mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...","""health other - health i have trouble swallowi..."
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...","""supertramp the logical song without cofounder..."
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...","""legal market overview‘ france is shrinking! ’..."
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...","""name argon symbol ar atomic number 18atomic m..."


### Removing characters that are neither letters nor whitespace

In [78]:
# Replace every character that is neither a word character (\w) nor whitespace (\s)
# with a single space.
# NOTE(review): the next cell uses the character class [^a-zA-Z\s], which matches
# everything this one does, so this step appears to be redundant.
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'[^\w\s]', value=' ', regex=True)

In [79]:
# Replace every character that is not an ASCII letter (a-z, A-Z) or whitespace with a
# single space. Besides punctuation this also removes digits, underscores and
# non-ASCII letters.
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'[^a-zA-Z\s]', value=' ', regex=True)

In [80]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health other health i have trouble swallowi...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp the logical song without cofounder...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france is shrinking ...
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...",name argon symbol ar atomic number atomic m...


In [81]:
df.loc[0,"PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

### Removing \n

In [82]:
df_with_newline = df[df['PreprocessedContent'].apply(lambda x: '\n' in x if isinstance(x, str) else False)]

In [83]:
df_with_newline.iloc[0]["PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

In [84]:
# Replace newlines with a space instead of deleting them. Deleting them fuses the
# last word of one line with the first word of the next ("foo\nbar" -> "foobar").
# Extra spaces are harmless: the later steps split on whitespace and re-join the
# tokens with single spaces.
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: x.replace('\n', ' ') if isinstance(x, str) else x)

In [85]:
df.iloc[0]["PreprocessedContent"]

'science   mathematics physics the hot glowing surfaces of stars emit energy in the form of electromagnetic radiation  it is a good approximation to assume that the emissivity e is equal to   for these surfaces   find the radius of the star rigel  the bright blue star in the constellation orion that radiates energy at a rate of     x       w and has a surface temperature of        k  assume that the star is spherical  use        show more follow   answers answers relevance rating newest oldest best answer  stefan boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature  q           t   the total energy flux at a spherical surface of radius r is q   q   r        t     r  hence the radius is r       q        t                 x      w            x    w m k          k                 x      m source  s       decade ago     comment schmiso  you forgot a   in your answer  your link even says it  l    pi  r   sigma  t     using l  luminosit

### Removing digits

In [86]:
# Replace every digit with a single space.
# NOTE(review): the earlier [^a-zA-Z\s] replacement already turned all digits into
# spaces, so this step looks like a no-op at this point — confirm before removing it.
df['PreprocessedContent'] = df['PreprocessedContent'].replace(to_replace=r'\d', value=' ', regex=True)

In [87]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics the hot glowing ...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health other health i have trouble swallowi...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp the logical song without cofounder...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france is shrinking ...
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...",name argon symbol ar atomic number atomic m...


### Remove Stopwords

In [88]:
from nltk.corpus import stopwords
import nltk

In [89]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\halil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [90]:
def remove_stopwords(text):
    """Drop every whitespace-separated token whose lowercase form is in ``stop_words``.

    The remaining tokens are joined back together with single spaces, so any run of
    whitespace in ``text`` is collapsed as a side effect.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [91]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_stopwords)

In [92]:
df['PreprocessedContent'].head()

0    science mathematics physics hot glowing surfac...
1    health health trouble swallowing due ms crush ...
2    supertramp logical song without cofounder roge...
3    legal market overview france shrinking laments...
4    name argon symbol ar atomic number atomic mass...
Name: PreprocessedContent, dtype: object

### Remove Punctuation

In [94]:
# Map every ASCII punctuation character (string.punctuation) to a single space.
# NOTE(review): the earlier [^a-zA-Z\s] replacement already removed these characters,
# so this step looks like a no-op here — confirm before deleting it.
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
df['PreprocessedContent'] = df['PreprocessedContent'].apply(lambda x: x.translate(punctuation_to_space))

In [95]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics hot glowing surfac...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health health trouble swallowing due ms crush ...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp logical song without cofounder roge...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france shrinking laments...
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...",name argon symbol ar atomic number atomic mass...


In [96]:
df.loc[22]

DocumentName                                             output_1018.txt
Content                "palpitation, fatigue, chills!Karen__0__0Hi......
PreprocessedContent    palpitation fatigue chills karen hi asthma cur...
Name: 22, dtype: object

### Remove One-Letter Words

In [103]:
def remove_one_letter_words(text):
    """Return ``text`` without its single-character tokens.

    ``text`` is split on whitespace, tokens of length 1 are discarded, and the rest
    are joined with single spaces.
    """
    tokens = text.split()
    longer_tokens = filter(lambda token: len(token) >= 2, tokens)
    return ' '.join(longer_tokens)

In [104]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_one_letter_words)

In [105]:
df['PreprocessedContent'].head()

0    science mathematics physics hot glowing surfac...
1    health health trouble swallowing due ms crush ...
2    supertramp logical song without cofounder roge...
3    legal market overview france shrinking laments...
4    name argon symbol ar atomic number atomic mass...
Name: PreprocessedContent, dtype: object

### Collapse Multiple Spaces

In [106]:
# Collapse every run of whitespace into a single space.
# NOTE(review): remove_one_letter_words has just re-joined the tokens with single
# spaces, so this looks like a no-op at this point — confirm before removing it.
df['PreprocessedContent'] = df['PreprocessedContent'].str.replace(r'\s+', ' ', regex=True)

In [107]:
df.head()

Unnamed: 0,DocumentName,Content,PreprocessedContent
0,output_1.txt,Science & Mathematics PhysicsThe hot glowing s...,science mathematics physics hot glowing surfac...
1,output_10.txt,"""Health Other - HealthI have trouble swallowin...",health health trouble swallowing due ms crush ...
2,output_100.txt,"""Supertramp The Logical Song without Cofounder...",supertramp logical song without cofounder roge...
3,output_1000.txt,"""Legal market overview‘ France is shrinking! ’...",legal market overview france shrinking laments...
4,output_1001.txt,"""Name ArgonSymbol ArAtomic Number 18Atomic Mas...",name argon symbol ar atomic number atomic mass...


### Remove Empty Rows

In [108]:
df['ContentLength'] = df['PreprocessedContent'].apply(lambda x: len(x) if isinstance(x, str) else 0)

In [109]:
df['ContentLength'].describe()

count      1557.000000
mean       4710.615286
std        8403.334118
min           0.000000
25%        1264.000000
50%        2542.000000
75%        4781.000000
max      101516.000000
Name: ContentLength, dtype: float64

In [110]:
results = pd.read_csv("SmallDataset/dev_query_results_small.csv")

In [111]:
# Rewrite the doc_number values as "output_<doc_number>.txt" so they can be compared
# with df['DocumentName'].
# NOTE: the column is overwritten in place, so running this cell a second time adds
# the "output_" prefix and ".txt" suffix again; re-run the read_csv cell first.
results['doc_number'] = 'output_' + results['doc_number'].astype(str) + '.txt'

In [112]:
results.head()

Unnamed: 0,Query_number,doc_number
0,1089273,output_590.txt
1,1087869,output_1478.txt
2,1087858,output_547.txt
3,1087532,output_595.txt
4,1086886,output_466.txt


In [113]:
filtered_df = df[df['DocumentName'].isin(results['doc_number'])]

In [114]:
filtered_df["ContentLength"].describe()

count      210.000000
mean      8374.171429
std      11399.216597
min        288.000000
25%       2721.000000
50%       4900.000000
75%       9571.000000
max      84768.000000
Name: ContentLength, dtype: float64

- The statistics above describe the content length of the documents that appear in the query results (the question–answer set).

In [115]:
df[df['ContentLength']<10]

Unnamed: 0,DocumentName,Content,PreprocessedContent,ContentLength
17,output_1013.txt,"""""\n",,0
157,output_114.txt,"""""\n",,0
400,output_1359.txt,"""""\n",,0
444,output_1399.txt,"""""\n",,0
616,output_1553.txt,"""""\n",,0
620,output_1557.txt,,,0
710,output_236.txt,"""""\n",,0
917,output_422.txt,"""""\n",,0
921,output_426.txt,"""""\n",,0
1171,output_651.txt,Settings\n,settings,8


In [116]:
# Discard documents whose preprocessed text is shorter than 10 characters (the empty
# files and the lone "settings" document listed above), then renumber the rows 0..n-1.
too_short = df['ContentLength'] < 10
df = df.loc[~too_short].reset_index(drop=True)

In [117]:
df["PreprocessedContent"].isna().sum()

np.int64(0)

### Strip Leading and Trailing Whitespace

In [118]:
df['PreprocessedContent'] = df['PreprocessedContent'].str.strip()

In [119]:
df.sample()["PreprocessedContent"].values[0]

'engineering com updated forum post question please visit new ask forum database questions library remain available extended period armanhow stop pole barn poles pushing ground view poles poured floor dirt driveway front poles pushing ground years ago months left answer response report abuse respond question share responses niel post pushing ground result phenomena know frost heave two links information prevent usually requires ensuring sufficient pole length frost line niel leon engineering com years ago source engineering com provide engineering advice ask service forum members exchange ideas relating world engineering caution users accept responses receive without validation rely engineering advice may get members ask forum engineering com specifically disclaims obligation validate verify information posted within ask service engineering com encourages users seek services professional engineer engineering advice may require'

In [120]:
df["PreprocessedContent"].isna().sum()

np.int64(0)

### Stemming

In [121]:
def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with the module-level ``stemmer``.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, text.split())
    return ' '.join(stemmed_tokens)

In [122]:
df["PreprocessedContent"].head().apply(stem_words)

0    scienc mathemat physic hot glow surfac star em...
1    health health troubl swallow due ms crush vali...
2    supertramp logic song without cofound roger ho...
3    legal market overview franc shrink lament one ...
4    name argon symbol ar atom number atom mass ato...
Name: PreprocessedContent, dtype: object

In [123]:
df["PreprocessedContent"] = df["PreprocessedContent"].apply(stem_words)

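### Remove Non-English Words

**Note:** the two cells below carry execution counts In [100]–[101], i.e. they were originally run *before* the one-letter-word removal and the stemming step. Run top to bottom, they would instead be applied to the stemmed text, where stems such as `scienc` are no longer dictionary words. Decide on the intended order and move the cells accordingly.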

In [100]:
# Lower-cased copy of the vocabulary. The NLTK word list contains capitalised entries
# (e.g. proper nouns), which a lower-cased token can never equal, so the membership
# test below has to be made against a case-folded set.
_english_words_lower = {entry.lower() for entry in english_words}

def remove_non_english_words(text):
    """Keep only the tokens of ``text`` that occur in the English vocabulary.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    str
        The tokens whose lowercase form is in the case-folded vocabulary, joined by
        single spaces. The comparison is case-insensitive on both sides.

    Notes
    -----
    NOTE(review): this is an exact dictionary lookup, so it is presumably meant to
    run on unstemmed tokens; stemmer output such as "scienc" would be dropped.
    Confirm where this step belongs relative to stemming.
    """
    return ' '.join(word for word in text.split() if word.lower() in _english_words_lower)

In [101]:
df['PreprocessedContent'] = df['PreprocessedContent'].apply(remove_non_english_words)

In [124]:
df.to_csv("SmallDataset/PreprocessedContents.csv",index=False)