### 1. Importing Libraries & Utilities

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import pickle

### 2. Exploring Dataset

In [2]:
df = pd.read_parquet('/content/drive/MyDrive/Python Development Assignment/stack_overflow_tech_final.parquet')

In [3]:
df.head()

Unnamed: 0,title,question,answer
0,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,"Another example:\nfor key, igroup in itertools..."
1,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,I would like to give another example where gro...
2,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,WARNING:\nThe syntax list(groupby(...)) won't ...
3,Adding a Method to an Existing Object Instance,I've read that it is possible to add a method ...,I think that the above answers missed the key ...
4,Adding a Method to an Existing Object Instance,I've read that it is possible to add a method ...,There are at least two ways for attach a metho...


In [4]:
df.shape

(3296, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3296 entries, 0 to 3295
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     3296 non-null   object
 1   question  3296 non-null   object
 2   answer    3296 non-null   object
dtypes: object(3)
memory usage: 77.4+ KB


In [6]:
df.isnull().sum()

Unnamed: 0,0
title,0
question,0
answer,0


In [7]:
df_copy = df.copy()

### 3. Text Cleaning

Normalising Tech Words

In [8]:
tech_words = [
    # Languages & frameworks
    (re.compile(r'\bC\+\+\b', re.I), 'cpp'),
    (re.compile(r'\bC#\b', re.I), 'csharp'),
    (re.compile(r'\bF#\b', re.I), 'fsharp'),
    (re.compile(r'\b\.NET\b', re.I), 'dotnet'),
    (re.compile(r'\bnode\.js\b', re.I), 'nodejs'),
    (re.compile(r'\breact[-_.]native\b', re.I), 'reactnative'),
    (re.compile(r'\b(express|next|vue|react)\.js\b', re.I), r'\1js'),
    (re.compile(r'\btailwind\.css\b', re.I), 'tailwindcss'),

    # ML & Data
    (re.compile(r'\bscikit[-_ ]learn\b', re.I), 'sklearn'),
    (re.compile(r'\bTensor\s*Flow\b', re.I), 'tensorflow'),
    (re.compile(r'\bOpenCV\b', re.I), 'opencv'),
    (re.compile(r'\bSciPy\b', re.I), 'scipy'),

    # Cloud / web
    (re.compile(r'\bAWS\b', re.I), 'aws'),
    (re.compile(r'\bGCP\b', re.I), 'gcp'),
    (re.compile(r'\bAzure\b', re.I), 'azure'),
    (re.compile(r'\bDocker\b', re.I), 'docker'),
    (re.compile(r'\bKubernetes\b', re.I), 'kubernetes'),
    (re.compile(r'\bPostgreSQL\b', re.I), 'postgresql'),
    (re.compile(r'\bMySQL\b', re.I), 'mysql'),
    (re.compile(r'\bSQLite\b', re.I), 'sqlite'),

    # AI models
    (re.compile(r'\bChatGPT\b', re.I), 'chatgpt'),
    (re.compile(r'\bGPT[-_ ]?4\b', re.I), 'gpt4'),
    (re.compile(r'\bGPT[-_ ]?3\b', re.I), 'gpt3'),
    (re.compile(r'\bLLaMA\b', re.I), 'llama'),
    (re.compile(r'\bGemini\b', re.I), 'gemini'),
    (re.compile(r'\bClaude\b', re.I), 'claude'),

    # Common libraries
    (re.compile(r'\bBeautiful\s*Soup\b', re.I), 'beautifulsoup'),
    (re.compile(r'\bFastAPI\b', re.I), 'fastapi'),
    (re.compile(r'\bSQLAlchemy\b', re.I), 'sqlalchemy'),
    (re.compile(r'\bRedis\b', re.I), 'redis'),
]

In [9]:
def replace_tech_words(text):
    if not isinstance(text, str):
        text = str(text)

    for pattern, replacement in tech_words:
        text = pattern.sub(replacement, text)
    return text

In [10]:
df_copy['answer'] = df_copy['answer'].apply(replace_tech_words)

Removing Links

In [11]:
def remove_links(txt: str) -> str:
  return re.sub(r'http\S+|www\S+', '', txt)

In [12]:
df_copy['answer'] = df_copy['answer'].apply(remove_links)

Removing HTML Tags

In [13]:
def striphtml(txt):
  HTML_TAG = re.compile(r"<[^>]+>")
  return HTML_TAG.sub(' ', txt)

In [14]:
df_copy['answer'] = df_copy['answer'].apply(striphtml)

Remove Emojis

In [15]:
def remove_emojis(txt: str) -> str:
    if not isinstance(txt, str):
        txt = str(txt)
    return ''.join(ch for ch in txt if ch.isascii())

In [16]:
df_copy['answer'] = df_copy['answer'].apply(remove_emojis)

Removing Numbers

In [17]:
def remove_num(txt):
    words = txt.split()
    return " ".join(word for word in words if not any(ch.isdigit() for ch in word))

In [18]:
df_copy['answer'] = df_copy['answer'].apply(remove_num)

Remove Punctuation

In [19]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
def remove_punc(txt):
    if not isinstance(txt, str):
        txt = str(txt)
    return txt.translate(str.maketrans('', '', string.punctuation))

In [21]:
df_copy['answer'] = df_copy['answer'].apply(remove_punc)

Lowercasing

In [22]:
df_copy['answer'] = df_copy['answer'].apply(lambda x : x.lower())

Removing Stopwords

In [23]:
stop_words = set(stopwords.words('english'))

In [24]:
def remove_stopwords(txt):
    if not isinstance(txt, str):
        txt = str(txt)

    words = word_tokenize(txt)
    filtered = [w for w in words if w.isalpha() and w.lower() not in stop_words]
    return " ".join(filtered)

In [25]:
df_copy['answer'] = df_copy['answer'].apply(remove_stopwords)

Removing Extra Spaces

In [26]:
df_copy['answer'] = df_copy['answer'].apply(lambda x: " ".join(str(x).split()))

In [27]:
df_copy['answer']

Unnamed: 0,answer
0,another example key igroup lambda x x print ke...
1,would like give another example groupby withou...
2,warning syntax listgroupby wont work way inten...
3,think answers missed key point lets class meth...
4,least two ways attach method instance without ...
...,...
3291,want indentation terms nesting level rather sp...
3292,solve real problem lead question could impleme...
3293,repr str python put many digits required make ...
3294,simple answer due quantization roundoff error ...


### 4. Post-Cleaning Analysis

Empty Rate

In [28]:
empty = (df_copy['answer'].str.len() == 0).mean()
print(f"Empty ratio: {empty:.2%}")

Empty ratio: 0.00%


Avg Tokens/Row

In [29]:
avg_len = df_copy['answer'].str.split().str.len().mean()
print(f"Avg tokens per row: {avg_len:.1f}")

Avg tokens per row: 63.3


Top Tokens

In [30]:
c = Counter(" ".join(df_copy['answer']).split())
print(c.most_common(25))

[('python', 3072), ('import', 2119), ('use', 1899), ('def', 1785), ('return', 1521), ('print', 1305), ('class', 1279), ('x', 1213), ('code', 1111), ('function', 1055), ('file', 1053), ('using', 1045), ('list', 995), ('like', 888), ('install', 862), ('one', 861), ('also', 855), ('object', 853), ('example', 831), ('b', 810), ('want', 773), ('c', 764), ('value', 755), ('name', 745), ('pip', 705)]


Corpus Word Size

In [31]:
unique_words = len(c)
print(f"Total Words: {unique_words}")

Total Words: 24469


Longest and Shortest Answers

In [32]:
df_copy['answer_length'] = df_copy['answer'].str.split().str.len()
print(df_copy['answer_length'].describe())

count    3296.000000
mean       63.329187
std        90.719906
min         1.000000
25%        18.000000
50%        35.000000
75%        72.000000
max      1296.000000
Name: answer_length, dtype: float64


Merging In Original Dataset

In [33]:
df['answer_length'] = df_copy['answer_length']

In [34]:
df['clean_answer'] = df_copy['answer']

In [35]:
df.head()

Unnamed: 0,title,question,answer,answer_length,clean_answer
0,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,"Another example:\nfor key, igroup in itertools...",74,another example key igroup lambda x x print ke...
1,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,I would like to give another example where gro...,61,would like give another example groupby withou...
2,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,WARNING:\nThe syntax list(groupby(...)) won't ...,45,warning syntax listgroupby wont work way inten...
3,Adding a Method to an Existing Object Instance,I've read that it is possible to add a method ...,I think that the above answers missed the key ...,140,think answers missed key point lets class meth...
4,Adding a Method to an Existing Object Instance,I've read that it is possible to add a method ...,There are at least two ways for attach a metho...,53,least two ways attach method instance without ...


### 5. Word Embedding Training (Word2Vec)

In [36]:
# Word Tokenisation
tokenised_words = [row.split() for row in df['clean_answer']]

In [37]:
from gensim.models import Word2Vec

In [38]:
model = Word2Vec(
    sentences=tokenised_words,
        vector_size=200,
        window=5,
        min_count=5,
        workers=4,
        sg=1,
        epochs=10
)

Checking Vocabulary & Similarity

In [39]:
print(f"Vocabulary size: {len(model.wv)}")

Vocabulary size: 4492


In [40]:
print(f"Top 10 Most Frequent Words")
for word in list(model.wv.index_to_key)[:20]:
    print(f"{word}: {model.wv.get_vecattr(word, 'count')}")

Top 10 Most Frequent Words
python: 3072
import: 2119
use: 1899
def: 1785
return: 1521
print: 1305
class: 1279
x: 1213
code: 1111
function: 1055
file: 1053
using: 1045
list: 995
like: 888
install: 862
one: 861
also: 855
object: 853
example: 831
b: 810


In [41]:
print(model.wv.most_similar("http", topn=10))

[('interact', 0.9098081588745117), ('trust', 0.9074766039848328), ('restful', 0.8908424973487854), ('recreating', 0.8818897604942322), ('mongodb', 0.8809951543807983), ('thirdparty', 0.8738827109336853), ('applications', 0.8718796968460083), ('djangos', 0.8701846599578857), ('migrations', 0.8681976199150085), ('web', 0.8650340437889099)]


Saving Model

In [42]:
with open('word2vec_model.pkl', 'wb') as f:
  pickle.dump(model,f)