# Import Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1.0: Understand the Data

In [2]:
# Read the data: "data.json" is parsed as JSON Lines, i.e. one JSON document
# per line; each parsed document becomes one row of the dataframe.
data = []

# Binary mode is kept on purpose: json.loads accepts bytes and detects the
# UTF-8/16/32 encoding of the input itself.
with open("data.json", 'rb') as f:
    for line in f:
        # A whitespace-only line is not a JSON document; json.loads would
        # raise JSONDecodeError on it, so skip it instead of aborting the load.
        if not line.strip():
            continue
        data.append(json.loads(line))

# Convert to dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,content,annotation
0,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ..."
1,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ..."
2,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22..."
3,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46..."
4,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18..."


# 2.0: Data Preparation and Cleaning

In [3]:
# After lower-casing, every character that is not an ASCII letter or
# whitespace is deleted. This single pass is equivalent to stripping special
# characters, then punctuation, then digits one after the other.
# NOTE: characters are deleted, not replaced by a space, so tokens separated
# only by punctuation are fused (e.g. "c/c++" becomes "cc").
NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Requires the NLTK "stopwords" and "wordnet" corpora to be installed locally.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps: lower-case, delete everything except ASCII letters and whitespace,
    split on whitespace, drop English stop words, lemmatize each remaining
    token with WordNetLemmatizer (default part of speech), and re-join the
    tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    letters_only = NON_LETTER_RE.sub('', text.lower())
    kept_words = [word for word in letters_only.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)


# One cleaned string per row of df["content"], in the same order.
raw_text = [clean_text(text) for text in df["content"].tolist()]

df['cleaned_text'] = raw_text
df.head()

Unnamed: 0,content,annotation,cleaned_text
0,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ...",govardhana k senior software engineer bengalur...
1,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ...",harini komaravelli test analyst oracle hyderab...
2,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22...",hartej kathuria data analyst intern oracle ret...
3,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46...",ijas nizamuddin associate consultant state str...
4,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18...",imgeeyaul ansari java developer pune maharasht...


In [4]:
# Keep only the modelling columns: rename 'annotation' to 'label' and drop the
# raw 'content' column (its cleaned form lives in 'cleaned_text').
# errors='ignore' makes the cell safe to re-run: on a second execution
# 'content' is already gone and a plain drop() would raise KeyError, while
# rename() already ignores a missing 'annotation' key.
df = (
    df
    .rename(columns={'annotation': 'label'})
    .drop(columns=['content'], errors='ignore')
)
df.head()

Unnamed: 0,label,cleaned_text
0,"[{'label': ['Companies worked at'], 'points': ...",govardhana k senior software engineer bengalur...
1,"[{'label': ['Companies worked at'], 'points': ...",harini komaravelli test analyst oracle hyderab...
2,"[{'label': ['Skills'], 'points': [{'start': 22...",hartej kathuria data analyst intern oracle ret...
3,"[{'label': ['Skills'], 'points': [{'start': 46...",ijas nizamuddin associate consultant state str...
4,"[{'label': ['Skills'], 'points': [{'start': 18...",imgeeyaul ansari java developer pune maharasht...
