# Import Libraries

In [2]:
import json
import chardet
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1.0: Understand the Data

In [3]:
# Read the file as raw bytes so its text encoding can be detected before decoding.
with open('data.json', 'rb') as f:
    raw_bytes = f.read()

# Detect the encoding.
# chardet reports an 'encoding' of None when it cannot identify the input;
# fall back to UTF-8 (the standard JSON encoding) instead of letting
# bytes.decode(None) fail with an unhelpful TypeError.
encoding = chardet.detect(raw_bytes)['encoding'] or 'utf-8'

# Decode the bytes and parse the resulting text as JSON.
data = json.loads(raw_bytes.decode(encoding))

# Create a DataFrame from the parsed JSON.
df = pd.DataFrame(data)

# Preview the DataFrame
df.head()

Unnamed: 0,001,002,003,004,005,006,007,008,009,010,...,191,192,193,194,195,196,197,198,199,200
content,Govardhana K\nSenior Software Engineer\n\nBeng...,"Harini Komaravelli\nTest Analyst at Oracle, Hy...",Hartej Kathuria\nData Analyst Intern - Oracle ...,Ijas Nizamuddin\nAssociate Consultant - State ...,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","Jay Madhavi\nNavi Mumbai, Maharashtra - Email ...",Jitendra Babu\nFI/CO Consultant in Tech Mahind...,Jyotirbindu Patnaik\nAssociate consultant@SAP ...,Karthihayini C\nSystems Engineer - Infosys Lim...,Karthik GV\nArchitect - Microsoft India\n\nHyd...,...,Pawan Nag\nMicrosoft Certified System Engineer...,Shivam Sharma\nL1 Analyst in Microsoft project...,Gaikwad Dilip\nMicrosoft Dainamic Software Bil...,Moumita Mitra\n- Email me on Indeed: indeed.co...,"Suman Biswas\nSAP UI5 Lead, Native HANA Develo...","Mansi Thanki\nStudent\n\nJamnagar, Gujarat - E...",Anil Kumar\nMicrosoft Azure (Basic Management)...,Siddharth Choudhary\nMicrosoft Office Suite - ...,Valarmathi Dhandapani\nInvestment Banking Oper...,Pradeep Kumar\nSecurity Analyst in Infosys - C...
annotation,"[{'label': ['Companies worked at'], 'points': ...","[{'label': ['Companies worked at'], 'points': ...","[{'label': ['Skills'], 'points': [{'start': 22...","[{'label': ['Skills'], 'points': [{'start': 46...","[{'label': ['Skills'], 'points': [{'start': 18...","[{'label': ['Graduation Year'], 'points': [{'s...","[{'label': ['Graduation Year'], 'points': [{'s...","[{'label': ['Skills'], 'points': [{'start': 30...","[{'label': ['Skills'], 'points': [{'start': 21...","[{'label': ['Skills'], 'points': [{'start': 40...",...,"[{'label': ['Email Address'], 'points': [{'sta...","[{'label': ['Skills'], 'points': [{'start': 13...","[{'label': ['Skills'], 'points': [{'start': 95...","[{'label': ['Skills'], 'points': [{'start': 19...","[{'label': ['Companies worked at'], 'points': ...","[{'label': ['College Name'], 'points': [{'star...","[{'label': ['Location'], 'points': [{'start': ...","[{'label': ['Skills'], 'points': [{'start': 78...","[{'label': ['Skills'], 'points': [{'start': 92...","[{'label': ['Skills'], 'points': [{'start': 58..."


In [4]:
# Transpose so that "content" and "annotation" become columns.
# The guard keeps this cell idempotent: an unconditional `df = df.T` flips the
# frame back to its original orientation whenever the cell is re-run, after
# which column lookups such as df["content"] fail.
if "content" in df.index:
    df = df.T
df.head()

Unnamed: 0,content,annotation
1,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ..."
2,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ..."
3,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22..."
4,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46..."
5,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18..."


# 2.0: Data Preparation and Cleaning

In [5]:
# Build df["cleaned_text"]: each entry of df["content"] lower-cased, stripped of
# everything except ASCII letters and whitespace, with English stop words
# removed and the remaining tokens lemmatized and re-joined with single spaces.

# After lower-casing, deleting "special characters", punctuation and digits in
# three separate passes is equivalent to deleting every character that is not
# a-z or whitespace, so a single pattern (compiled once) does the same work.
NON_LETTER_RE = re.compile(r'[^a-z\s]')

stop_words = set(stopwords.words('english'))

# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every token of a tokenized document.

    Args:
        text: iterable of token strings.

    Returns:
        A list with one lemma per input token, in the same order. The
        lemmatizer is called with its default part-of-speech setting.
    """
    return [lemmatizer.lemmatize(w) for w in text]


def clean_resume_text(text):
    """Normalize one raw document into a cleaned, space-separated string.

    Args:
        text: the raw document as a string.

    Returns:
        The lower-cased text with non-letter characters deleted, stop words
        dropped, remaining tokens lemmatized and joined by single spaces.
    """
    # NOTE(review): characters are deleted, not replaced by a space, so tokens
    # separated only by punctuation are merged (e.g. "a/b" becomes "ab").
    # Kept as-is to preserve the existing output; confirm this is intended.
    letters_only = NON_LETTER_RE.sub('', text.lower())
    tokens = [word for word in letters_only.split() if word not in stop_words]
    return ' '.join(lemmatize_text(tokens))


raw_text = [clean_resume_text(text) for text in df["content"].tolist()]

df['cleaned_text'] = raw_text
df.head()

Unnamed: 0,content,annotation,cleaned_text
1,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ...",govardhana k senior software engineer bengalur...
2,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ...",harini komaravelli test analyst oracle hyderab...
3,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22...",hartej kathuria data analyst intern oracle ret...
4,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46...",ijas nizamuddin associate consultant state str...
5,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18...",imgeeyaul ansari java developer pune maharasht...
