In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [2]:
import nltk
from nltk.corpus import  stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

In [3]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type
0,GERRES15609,<html> b'Author and/or Review architecture/des...,Responsibility
1,PHERES15784,<html> b'Should be able to develop custom dyna...,Responsibility
2,GERREQ10457,<html> b'Experience in working cross\\u2010fun...,Requirement
3,GERSKL27235,"<html> b'Previous business experience, includi...",Skill
4,HONSSK18415,b'Delivering fast and right the first \\U0001f...,SoftSkill


In [4]:
# train_data['label'] = train_data.groupby("Type", sort=False).ngroup()
# train_data.head()
le = LabelEncoder()
train_data['label'] = le.fit_transform(train_data['Type'])
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type,label
0,GERRES15609,<html> b'Author and/or Review architecture/des...,Responsibility,3
1,PHERES15784,<html> b'Should be able to develop custom dyna...,Responsibility,3
2,GERREQ10457,<html> b'Experience in working cross\\u2010fun...,Requirement,2
3,GERSKL27235,"<html> b'Previous business experience, includi...",Skill,4
4,HONSSK18415,b'Delivering fast and right the first \\U0001f...,SoftSkill,5


In [5]:
y = train_data['label']
train_data.drop('label', inplace=True, axis=1)
train_data

Unnamed: 0,Sentence_id,New_Sentence,Type
0,GERRES15609,<html> b'Author and/or Review architecture/des...,Responsibility
1,PHERES15784,<html> b'Should be able to develop custom dyna...,Responsibility
2,GERREQ10457,<html> b'Experience in working cross\\u2010fun...,Requirement
3,GERSKL27235,"<html> b'Previous business experience, includi...",Skill
4,HONSSK18415,b'Delivering fast and right the first \\U0001f...,SoftSkill
...,...,...,...
60110,UAERES18030,"b'In this position, you will \\u2705 utilize y...",Responsibility
60111,GERRES3026,"<html> b'In addition, this individual will be ...",Responsibility
60112,INDSSK5492,<br> b'Good problem \\u2705 solving skills.',SoftSkill
60113,PHESSK15092,b'Good Excel knowledge \\u2705 <br> .',SoftSkill


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60115 entries, 0 to 60114
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Sentence_id   60115 non-null  object
 1   New_Sentence  60115 non-null  object
 2   Type          60115 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [7]:
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('\n', ' ')
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('\"', '')
train_data['New_Sentence'][:5]
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('<html>', ' ')
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('</html>', '')
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('b\'', ' ')
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('<br>', '')
train_data['New_Sentence'] = train_data['New_Sentence'].str.replace('<p>', '')
train_data['New_Sentence'][:5]

0       Author and/or Review architecture/design an...
1       Should be able to develop custom dynamic sh...
2       Experience in working cross\\u2010functiona...
3       Previous business experience, including but...
4     Delivering fast and right the first \\U0001f5...
Name: New_Sentence, dtype: object

In [8]:
def punctuation_to_features(df, column):
    df[column] = df[column].str.replace('!',' exclamation ')
    df[column] = df[column].str.replace('?',' question ')
    df[column] = df[column].str.replace('\'',' quotation ')
    df[column] = df[column].str.replace('\"',' quotation ')
    df[column] = df[column].str.replace('/', 'slash')
    df[column] = df[column].str.replace('\\', 'backslash')
    
    
    return df[column]

In [9]:
train_data['New_Sentence'] = punctuation_to_features(train_data, 'New_Sentence')
train_data['New_Sentence'][4]

  df[column] = df[column].str.replace('?',' question ')
  df[column] = df[column].str.replace('\\', 'backslash')


' Delivering fast and right the first backslashbackslashU0001f517  time. quotation '

In [10]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

In [11]:
train_data['New_Sentence_tokenized'] = train_data.apply(lambda x : tokenize(x['New_Sentence']), axis=1)
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type,New_Sentence_tokenized
0,GERRES15609,Author andslashor Review architectureslashd...,Responsibility,"[Author, andslashor, Review, architectureslash..."
1,PHERES15784,Should be able to develop custom dynamic sh...,Responsibility,"[Should, be, able, to, develop, custom, dynami..."
2,GERREQ10457,Experience in working crossbackslashbacksla...,Requirement,"[Experience, in, working, with, a, slashp, lar..."
3,GERSKL27235,"Previous business experience, including but...",Skill,"[Previous, business, experience, including, bu..."
4,HONSSK18415,Delivering fast and right the first backslash...,SoftSkill,"[Delivering, fast, and, right, the, first, tim..."


In [12]:
def remove_stopwords(tokenized_column):
    stops = set(stopwords.words('english'))
    return [word for word in tokenized_column if not word in stops]

In [13]:
train_data['New_Sentence_tokenized'] = train_data.apply(lambda x: remove_stopwords(x['New_Sentence_tokenized']), axis=1)
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type,New_Sentence_tokenized
0,GERRES15609,Author andslashor Review architectureslashd...,Responsibility,"[Author, andslashor, Review, architectureslash..."
1,PHERES15784,Should be able to develop custom dynamic sh...,Responsibility,"[Should, able, develop, custom, dynamic, shape..."
2,GERREQ10457,Experience in working crossbackslashbacksla...,Requirement,"[Experience, working, slashp, larger, Engineer..."
3,GERSKL27235,"Previous business experience, including but...",Skill,"[Previous, business, experience, including, li..."
4,HONSSK18415,Delivering fast and right the first backslash...,SoftSkill,"[Delivering, fast, right, first, time, quotation]"


In [14]:
def apply_stemming(tokenized_column):
    stemmer = PorterStemmer()
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [15]:
train_data['New_Sentence_tokenized'] = train_data.apply(lambda x  : apply_stemming(x['New_Sentence_tokenized']),axis=1)
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type,New_Sentence_tokenized
0,GERRES15609,Author andslashor Review architectureslashd...,Responsibility,"[author, andslashor, review, architectureslash..."
1,PHERES15784,Should be able to develop custom dynamic sh...,Responsibility,"[should, abl, develop, custom, dynam, shape, o..."
2,GERREQ10457,Experience in working crossbackslashbacksla...,Requirement,"[experi, work, slashp, larger, engin, organ, m..."
3,GERSKL27235,"Previous business experience, including but...",Skill,"[previou, busi, experi, includ, limit, busi, m..."
4,HONSSK18415,Delivering fast and right the first backslash...,SoftSkill,"[deliv, fast, right, first, time, quotat]"


In [16]:
def rejoin_words(tokenized_columns):
    return (' '.join(tokenized_columns))

In [17]:
train_data['New_Sentence_text'] = train_data.apply(lambda x: rejoin_words(x['New_Sentence_tokenized']), axis=1)
train_data.head()

Unnamed: 0,Sentence_id,New_Sentence,Type,New_Sentence_tokenized,New_Sentence_text
0,GERRES15609,Author andslashor Review architectureslashd...,Responsibility,"[author, andslashor, review, architectureslash...",author andslashor review architectureslashdesi...
1,PHERES15784,Should be able to develop custom dynamic sh...,Responsibility,"[should, abl, develop, custom, dynam, shape, o...",should abl develop custom dynam shape object s...
2,GERREQ10457,Experience in working crossbackslashbacksla...,Requirement,"[experi, work, slashp, larger, engin, organ, m...",experi work slashp larger engin organ multipl ...
3,GERSKL27235,"Previous business experience, including but...",Skill,"[previou, busi, experi, includ, limit, busi, m...",previou busi experi includ limit busi manag en...
4,HONSSK18415,Delivering fast and right the first backslash...,SoftSkill,"[deliv, fast, right, first, time, quotat]",deliv fast right first time quotat


In [18]:
from sklearn.model_selection import train_test_split
X = train_data['New_Sentence_text']

vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(X)
X_bow = bow_transformer.transform(X)
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(X_bow)

# Transform entire BoW into tf-idf corpus
X_tfidf = tfidf_transformer.transform(X_bow)
print(X_tfidf)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=1)

  (0, 14991)	0.17346500145751848
  (0, 14988)	0.3075937779705623
  (0, 14949)	0.15173183139329047
  (0, 14802)	0.16551729223219488
  (0, 14387)	0.30332464004967113
  (0, 12870)	0.23283678034272715
  (0, 12218)	0.043069997055860856
  (0, 12171)	0.2090360581977436
  (0, 7034)	0.22158233969957972
  (0, 5214)	0.20061182945443426
  (0, 4484)	0.22476345188158028
  (0, 4183)	0.1439540533699957
  (0, 3999)	0.2830013545555561
  (0, 1202)	0.29560183948917396
  (0, 893)	0.4192696790213755
  (0, 781)	0.18906167079543285
  (0, 619)	0.21470668411273322
  (0, 163)	0.21776538947284943
  (1, 15962)	0.32783890149430434
  (1, 15155)	0.23447888135519193
  (1, 13952)	0.09473175418555858
  (1, 13698)	0.295296999410629
  (1, 13649)	0.41782115342060666
  (1, 13379)	0.3321905649725264
  (1, 12218)	0.05373669260625134
  :	:
  (60111, 5214)	0.11218030412115676
  (60111, 3381)	0.15315193305315786
  (60111, 3065)	0.19213791504096972
  (60111, 3031)	0.14813446822825357
  (60111, 2917)	0.15284854873845036
  (60111, 

In [23]:
# from keras.layers import LSTM, Dense, Embedding
# from keras.models import Sequential
xg_model = XGBClassifier()

y_hat = xg_model.fit(X_train, y_train).predict(X_test)
# model = Sequential([
#     Embedding(25000,128),
#     LSTM(128, dropout=0.25, recurrent_dropout=0.25),
#     Dense(1, activation='softmax')
# ])

# model.compile(
#     loss='binary_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )
# model.fit(X_train.toarray(), y_train, batch_size=32, epochs=10, verbose=2, validation_data=(X_test.toarray(), y_test))

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print('Accuracy:', accuracy_score(y_test, y_hat))
print('F1score:', f1_score(y_test, y_hat, average = 'macro'))

Accuracy: 0.7134460770723593
F1score: 0.6903367756227049


In [25]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,Sentence_id,New_Sentence
0,UAERES16346,<html> b'Collaborate across all of DB&T practi...
1,COGREQ15586,b'Strong \\u2705 technology expertise in Ident...
2,UAEREQ12722,b'Strong knowledge <br> on Service \\U0001f517...
3,COGSKL29155,<html> b'Architect scalable data processing an...
4,PHERES12551,"<html> b'Map client organization, build outsta..."


In [26]:
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('\n', ' ')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('\"', '')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('<html>', ' ')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('</html>', '')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('b\'', ' ')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('<br>', '')
test_data['New_Sentence'] = test_data['New_Sentence'].str.replace('<p>', '')
test_data['New_Sentence'][:5]

0       Collaborate across all of DB&T practices an...
1     Strong \\u2705 technology expertise in Identi...
2     Strong knowledge  on Service \\U0001f517 Virt...
3       Architect scalable data processing and anal...
4       Map client organization, build outstanding ...
Name: New_Sentence, dtype: object

In [27]:
test_data['New_Sentence'] = punctuation_to_features(test_data, 'New_Sentence')
test_data['New_Sentence'][4]

  df[column] = df[column].str.replace('?',' question ')
  df[column] = df[column].str.replace('\\', 'backslash')


'   Map client organization, build outstanding relationships with new business units, backslashbackslashU0001f4c2 and build a sales strategy for developing  new business  opportunities. quotation '

In [28]:
test_data['New_Sentence_tokenized'] = test_data.apply(lambda x : tokenize(x['New_Sentence']), axis=1)
test_data.head()

Unnamed: 0,Sentence_id,New_Sentence,New_Sentence_tokenized
0,UAERES16346,Collaborate across all of DB&T practices an...,"[Collaborate, across, all, of, DB, T, practice..."
1,COGREQ15586,Strong backslashbackslashu2705 technology exp...,"[Strong, technology, expertise, in, Identity, ..."
2,UAEREQ12722,Strong knowledge on Service backslashbacksla...,"[Strong, knowledge, on, Service, Virtualizatio..."
3,COGSKL29155,Architect scalable data processing and anal...,"[Architect, scalable, data, processing, and, a..."
4,PHERES12551,"Map client organization, build outstanding ...","[Map, client, organization, build, outstanding..."


In [29]:
test_data['New_Sentence_tokenized'] = test_data.apply(lambda x: remove_stopwords(x['New_Sentence_tokenized']), axis=1)
test_data.head()

Unnamed: 0,Sentence_id,New_Sentence,New_Sentence_tokenized
0,UAERES16346,Collaborate across all of DB&T practices an...,"[Collaborate, across, DB, T, practices, offeri..."
1,COGREQ15586,Strong backslashbackslashu2705 technology exp...,"[Strong, technology, expertise, Identity, Acce..."
2,UAEREQ12722,Strong knowledge on Service backslashbacksla...,"[Strong, knowledge, Service, Virtualization, q..."
3,COGSKL29155,Architect scalable data processing and anal...,"[Architect, scalable, data, processing, analyt..."
4,PHERES12551,"Map client organization, build outstanding ...","[Map, client, organization, build, outstanding..."


In [30]:
test_data['New_Sentence_tokenized'] = test_data.apply(lambda x  : apply_stemming(x['New_Sentence_tokenized']),axis=1)
test_data.head()

Unnamed: 0,Sentence_id,New_Sentence,New_Sentence_tokenized
0,UAERES16346,Collaborate across all of DB&T practices an...,"[collabor, across, db, t, practic, offer, ai, ..."
1,COGREQ15586,Strong backslashbackslashu2705 technology exp...,"[strong, technolog, expertis, ident, access, m..."
2,UAEREQ12722,Strong knowledge on Service backslashbacksla...,"[strong, knowledg, servic, virtual, quotat]"
3,COGSKL29155,Architect scalable data processing and anal...,"[architect, scalabl, data, process, analyt, so..."
4,PHERES12551,"Map client organization, build outstanding ...","[map, client, organ, build, outstand, relation..."


In [31]:
test_data['New_Sentence_text'] = test_data.apply(lambda x: rejoin_words(x['New_Sentence_tokenized']), axis=1)
test_data.head()

Unnamed: 0,Sentence_id,New_Sentence,New_Sentence_tokenized,New_Sentence_text
0,UAERES16346,Collaborate across all of DB&T practices an...,"[collabor, across, db, t, practic, offer, ai, ...",collabor across db t practic offer ai data iot...
1,COGREQ15586,Strong backslashbackslashu2705 technology exp...,"[strong, technolog, expertis, ident, access, m...",strong technolog expertis ident access manag q...
2,UAEREQ12722,Strong knowledge on Service backslashbacksla...,"[strong, knowledg, servic, virtual, quotat]",strong knowledg servic virtual quotat
3,COGSKL29155,Architect scalable data processing and anal...,"[architect, scalabl, data, process, analyt, so...",architect scalabl data process analyt solut in...
4,PHERES12551,"Map client organization, build outstanding ...","[map, client, organ, build, outstand, relation...",map client organ build outstand relationship n...


In [32]:
from sklearn.model_selection import train_test_split
X = train_data['New_Sentence_text']
X_test = test_data['New_Sentence_text']

vectorizer = CountVectorizer()

bow_transformer = vectorizer.fit(X)
X_bow = bow_transformer.transform(X)
X_test_bow = bow_transformer.transform(X_test)
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(X_bow)

# Transform entire BoW into tf-idf corpus
X_tfidf = tfidf_transformer.transform(X_bow)
X_test_tfidf = tfidf_transformer.transform(X_test_bow)
print(X_test_tfidf.shape)

# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=1)

(15029, 17006)


In [33]:
y_hat = xg_model.fit(X_tfidf, y).predict(X_test_tfidf)
y_hat
y_hat_categorical = le.inverse_transform(y_hat)
y_hat_categorical

array(['Responsibility', 'Skill', 'Skill', ..., 'SoftSkill',
       'Requirement', 'Requirement'], dtype=object)

In [34]:
output = pd.DataFrame({'Sentence_id': test_data.Sentence_id,
                       'Type': y_hat_categorical})
output.to_csv('submission.csv', index=False)