## DS Apprentice Task

### Setting up the Environment

In [1]:
import os
import sys
import csv
import numpy as np
import pandas as pd

# Download every NLTK resource this notebook relies on, so it also runs on a
# machine where nothing has been fetched yet.
import nltk
nltk.download('stopwords')                        # stopword list used for token filtering
nltk.download('punkt_tab')                        # Punkt models needed by word_tokenize
nltk.download('averaged_perceptron_tagger_eng')   # POS tagger used before lemmatization
nltk.download('wordnet')                          # lexical database used by WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 0. CSV file cleaning

In [2]:
data_dir = os.path.abspath(os.path.join("..", "data"))

input_path = os.path.join(data_dir, "gig_docs.csv")
output_path = os.path.join(data_dir, "cleaned_gig_docs.csv")

# Number of leading cells copied through unchanged; every cell after them is
# treated as a fragment of the answer text and merged back into one cell.
LEADING_COLUMN_COUNT = 3

# Both files are opened with newline='' as the csv module requires, so that
# line endings inside quoted fields are handled by the csv reader/writer.
with open(input_path, "r", newline='', encoding="utf-8") as infile, \
        open(output_path, "w", newline='', encoding="utf-8") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)

    # An empty input file has no header; write nothing instead of crashing.
    header = next(reader, None)
    if header is not None:
        writer.writerow(header)

    for row in reader:
        # csv.reader yields [] for blank lines; skipping them avoids emitting
        # a bogus one-cell row into the cleaned file.
        if not row:
            continue

        # Presumably the raw answers contain unquoted commas, which split one
        # answer over several trailing cells; re-join them into a single cell.
        answer = ", ".join(row[LEADING_COLUMN_COUNT:])

        writer.writerow(row[:LEADING_COLUMN_COUNT] + [answer])


### 1. Data Exploration

In [3]:
# Read the cleaned CSV written by the cleaning step into a DataFrame
cleaned_csv_path = os.path.join(data_dir, "cleaned_gig_docs.csv")
df = pd.read_csv(cleaned_csv_path)

# Promote Doc_ID to the index, but only when every ID is distinct
if df['Doc_ID'].is_unique:
    df = df.set_index('Doc_ID')

df


Unnamed: 0_level_0,Topic,Question_Example,Answer_Snippet
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,GiG Broker,What is GiG Broker?,GiG Broker offers a list of data streams consi...
13,GiG Broker,How is data consistency handled in GiG Broker?,"GiG Broker guarantees eventual consistency, w..."
14,GiG Broker,What are enriched data streams?,Enriched data streams from Core Data are pushe...
29,GiG Broker,How can consumers access data from GiG Broker?,Consumers authenticate to the service and can ...
38,GiG Broker,What is the purpose of the CRM Payload in GiG ...,The CRM Payload is a stream tailored for CRM t...
...,...,...,...
56,SQL Connectivity,What options are available for users to connec...,Authorised users can connect using several opt...
57,SQL Connectivity,What is dim_customer in the SQL Connectivity l...,dim_customer is a dimension table that stores ...
58,SQL Connectivity,What kind of information is in fact_bonus in t...,fact_bonus stores bonus information including ...
66,SQL Connectivity,What is 'dim_game_detail' in SQL Connectivity?,This dimension stores detailed game informatio...


In [4]:
# Count how many documents belong to each topic (most frequent first)
topic_counts = df['Topic'].value_counts()

print("Number of documents per topic:")
for topic, document_count in topic_counts.items():
    print(f"{topic}: {document_count}")

Number of documents per topic:
GiG Data Segmentation: 22
SQL Connectivity: 13
GiG Broker: 10
KPI Definitions: 9
GRE: 8
Market Packages: 8


In [5]:
# Tally the null cells per column
missing_values = df.isna().sum()

print("\nMissing values in each column:")
for column, count in missing_values.items():
    print(f"{column}: {count} missing values")

# TODO: Handle missing values if necessary


Missing values in each column:
Topic: 0 missing values
Question_Example: 0 missing values
Answer_Snippet: 0 missing values


### 2. Data Preparation

In [6]:
import re
import string
import contractions
from unidecode import unidecode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Single lemmatizer instance shared by every call to lemmantize_tokens
lemmatizer = WordNetLemmatizer()

# English stopwords held in a set so token filtering is a fast membership test
stop_words = set(stopwords.words('english'))

# Lowercase abbreviations mapped to the lowercase phrase they expand to
abbreviation_map = dict(
    gig="gaming innovation group",
    sql="structured query language",
    kpi="key performance indicator",
    kpis="key performance indicators",
    gre="game recommendation engine",
    rtp="return to player",
    atpu="average time per user",
)

def prepare_text(text, remove_numbers=False, remove_stopwords=True, apply_lemmantization=True, remove_contractions=True, remove_abbreviations=True):
    """Prepare a string of text for search/matching and return its tokens.

    Steps, in order: expand contractions, clean the text (see clean_text),
    expand known abbreviations, tokenize, drop stopwords, lemmatize.

    Args:
        text: The raw input string.
        remove_numbers: Passed to clean_text; when True, digits are removed.
        remove_stopwords: When True, tokens found in stop_words are dropped.
        apply_lemmantization: When True, tokens are lemmatized with
            lemmantize_tokens.
        remove_contractions: When True, contractions are expanded
            (e.g., "don't" -> "do not").
        remove_abbreviations: When True, whole-word keys of abbreviation_map
            are replaced by their expansions (e.g., "gig" ->
            "gaming innovation group").

    Returns:
        A list of string tokens.

    Raises:
        ValueError: If text is not a string.
    """

    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    # Optional: Expand contractions (e.g., "don't" → "do not", "it's" → "it is").
    # This has to run BEFORE clean_text: clean_text strips punctuation,
    # apostrophes included, after which "it's" has already collapsed to "its"
    # and can no longer be told apart from the possessive. Normalizing with
    # unidecode + lower() first keeps the input in the same form the expansion
    # was previously given (both are repeated harmlessly inside clean_text),
    # and any capitals the expansion introduces are lowercased by clean_text.
    if remove_contractions:
        text = contractions.fix(unidecode(text).lower())

    # Clean the text (convert to lowercase, remove accents, punctuation, and special characters)
    cleaned_text = clean_text(text, remove_numbers=remove_numbers)

    # Optional: Expand abbreviations (e.g., "gig" → "gaming innovation group").
    # The emptiness guard matters: with no keys the pattern would be r'\b()\b',
    # which matches the empty string and makes the lookup raise KeyError.
    if remove_abbreviations and abbreviation_map:
        pattern = r'\b(' + '|'.join(re.escape(key) for key in abbreviation_map.keys()) + r')\b'
        cleaned_text = re.sub(pattern, lambda match: abbreviation_map[match.group()], cleaned_text)

    # Split the processed text into tokens
    tokenized_text = word_tokenize(cleaned_text)

    # Optional: Remove Stopwords (e.g., and, or, but, etc.) from tokens
    if remove_stopwords:
        tokenized_text = [word for word in tokenized_text if word not in stop_words]

    # Optional: Lemmatize the tokens (e.g., running → run, better → good)
    if apply_lemmantization:
        tokenized_text = lemmantize_tokens(tokenized_text)

    return tokenized_text

def clean_text(text, remove_numbers=False):
    """Normalize a string: ASCII-fold, lowercase, and strip unwanted characters.

    Punctuation (and digits, when requested) is deleted outright. Whitespace
    characters other than a plain space (tab, newline, carriage return, ...)
    are replaced by a space rather than deleted, so that words separated only
    by a line break or tab are not fused together ("foo\\nbar" -> "foo bar").

    Args:
        text: The input string.
        remove_numbers: When True, the digits 0-9 are also removed.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.

    Raises:
        ValueError: If text is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    # Normalize unicode characters (e.g., café → cafe) and convert to lowercase
    text = unidecode(text).lower()

    # Characters that are deleted: punctuation, plus digits when requested
    chars_to_delete = string.punctuation
    if remove_numbers:
        chars_to_delete += string.digits

    # Whitespace other than ' ' acts as a word separator, so map it to a space
    separator_chars = string.whitespace.replace(' ', '')

    translation_table = str.maketrans(
        separator_chars, ' ' * len(separator_chars), chars_to_delete
    )
    text = text.translate(translation_table)

    # Strip leading and trailing white spaces
    return text.strip()

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the corresponding WordNet POS constant.

    The decision is made on the tag's first letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

def lemmantize_tokens(tokens):
    """Return the WordNet lemma of each token, using its POS tag as a hint."""
    lemmas = []
    for token, tag in pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas


# Demo: run the full preparation pipeline on a deliberately messy sentence
text = " Text cleaning? This is a test for GiG...!\n Test works well. 123, & 456.78 and \tcafé."
cleaned_text = prepare_text(text)

# Show the raw input followed by the resulting token list
print(f"Original text: {text}\n", f"Cleaned text: {cleaned_text}", sep="\n")



Original text:  Text cleaning? This is a test for GiG...!
 Test works well. 123, & 456.78 and 	café.

Cleaned text: ['text', 'clean', 'test', 'game', 'innovation', 'group', 'test', 'work', 'well', '123', '45678', 'cafe']


In [7]:
from collections import Counter

# Tokenize the question and answer text of every document with prepare_text
for source_column, token_column in (
    ('Question_Example', 'Question_Tokenized'),
    ('Answer_Snippet', 'Answer_Tokenized'),
):
    df[token_column] = df[source_column].apply(prepare_text)

# Concatenate the two token lists per document (question tokens first)
df['Combined_Tokenized'] = df['Question_Tokenized'] + df['Answer_Tokenized']

# Per-document token frequencies
df['Combined_Token_Counts'] = df['Combined_Tokenized'].apply(Counter)

df.head()

Unnamed: 0_level_0,Topic,Question_Example,Answer_Snippet,Question_Tokenized,Answer_Tokenized,Combined_Tokenized,Combined_Token_Counts
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12,GiG Broker,What is GiG Broker?,GiG Broker offers a list of data streams consi...,"[game, innovation, group, broker]","[game, innovation, group, broker, offer, list,...","[game, innovation, group, broker, game, innova...","{'game': 2, 'innovation': 2, 'group': 2, 'brok..."
13,GiG Broker,How is data consistency handled in GiG Broker?,"GiG Broker guarantees eventual consistency, w...","[data, consistency, handle, game, innovation, ...","[game, innovation, group, broker, guarantee, e...","[data, consistency, handle, game, innovation, ...","{'data': 4, 'consistency': 2, 'handle': 1, 'ga..."
14,GiG Broker,What are enriched data streams?,Enriched data streams from Core Data are pushe...,"[enrich, data, stream]","[enrich, data, stream, core, data, push, respe...","[enrich, data, stream, enrich, data, stream, c...","{'enrich': 3, 'data': 4, 'stream': 2, 'core': ..."
29,GiG Broker,How can consumers access data from GiG Broker?,Consumers authenticate to the service and can ...,"[consumer, access, data, game, innovation, gro...","[consumer, authenticate, service, tap, necessa...","[consumer, access, data, game, innovation, gro...","{'consumer': 2, 'access': 1, 'data': 4, 'game'..."
38,GiG Broker,What is the purpose of the CRM Payload in GiG ...,The CRM Payload is a stream tailored for CRM t...,"[purpose, crm, payload, game, innovation, grou...","[crm, payload, stream, tailor, crm, collect, s...","[purpose, crm, payload, game, innovation, grou...","{'purpose': 1, 'crm': 3, 'payload': 3, 'game':..."


In [8]:
import pickle

# Map each index label of df to that document's token Counter
data = {
    doc_id: token_counts
    for doc_id, token_counts in df['Combined_Token_Counts'].items()
}

# Persist the mapping next to the other data files
output_pickle_path = os.path.join(data_dir, "gig_docs_tokenized.pkl")
with open(output_pickle_path, 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# The vectorizer takes one string per document, so re-join each token list
# from the 'Combined_Tokenized' column with single spaces.
count_documents = [' '.join(tokens) for tokens in df['Combined_Tokenized']]

vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(count_documents)

# Store the fitted vectorizer, the document-term matrix and the matching index labels
output_pickle_path = os.path.join(data_dir, "countvector_data.pkl")
count_payload = {
    "vectorizer": vectorizer,
    "matrix": matrix,
    "doc_ids": df.index.tolist()
}
with open(output_pickle_path, 'wb') as pickle_file:
    pickle.dump(count_payload, pickle_file)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer takes one string per document, so re-join each token list
# from the 'Combined_Tokenized' column with single spaces.
tfidf_documents = [' '.join(tokens) for tokens in df['Combined_Tokenized']]

# Fit and transform
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tfidf_documents)

# Store the fitted vectorizer, the TF-IDF matrix and the matching index labels
output_pickle_path = os.path.join(data_dir, "tfidf_data.pkl")
tfidf_payload = {
    "vectorizer": vectorizer,
    "matrix": tfidf_matrix,
    "doc_ids": df.index.tolist()
}
with open(output_pickle_path, "wb") as pickle_file:
    pickle.dump(tfidf_payload, pickle_file)