In [1]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder


In [2]:
train_file = 'data/train_ner.tsv'
test_file = 'data/test_ner.tsv'
train_df = pd.read_csv(train_file, delimiter='\t')
test_df = pd.read_csv(test_file, delimiter='\t')

  train_df = pd.read_csv(train_file, delimiter='\t')
  test_df = pd.read_csv(test_file, delimiter='\t')


In [3]:
train_df = train_df[train_df["distance_head"]!="_"]

In [4]:
train_df = train_df.reset_index()

In [5]:
train_df

Unnamed: 0,index,sent_id,token_id,token,lemma,POS,uni_POS,morph_type,distance_head,dep_label,dep_rel,space,probbank,ner,token_bigram,pos_bigram,target
0,0,0,1.0,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,PERSON,"(None, 'Al')","(None, 'PROPN')",_
1,1,0,2.0,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,PERSON,"('Al', '-')","('PROPN', 'PUNCT')",_
2,2,0,3.0,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,PERSON,"('-', 'Zaman')","('PUNCT', 'PROPN')",_
3,3,0,4.0,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_,"('Zaman', ':')","('PROPN', 'PUNCT')",_
4,4,0,5.0,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,NORP,"(':', 'American')","('PUNCT', 'ADJ')",_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035899,1035923,7506,131.0,graduated,graduate,VERB,VBN,Tense=Past|VerbForm=Part,123,advcl,123:advcl:since,_,graduate.01,_,"('have', 'graduated')","('AUX', 'VERB')",_
1035900,1035924,7506,132.0,and,and,CCONJ,CC,_,134,cc,134:cc,_,_,_,"('graduated', 'and')","('VERB', 'CCONJ')",_
1035901,1035925,7506,133.0,i,i,PRON,PRP,Case=Nom|Number=Sing|Person=1|PronType=Prs,134,nsubj,134:nsubj,_,_,PERSON,"('and', 'i')","('CCONJ', 'PRON')",ARG0
1035902,1035926,7506,134.0,hate,hate,VERB,VBP,Mood=Ind|Tense=Pres|VerbForm=Fin,4,conj,4:conj:and,_,hate.01,_,"('i', 'hate')","('PRON', 'VERB')",V


In [6]:
# Separate the categorical and numerical features
cat_features = train_df[['token', 'lemma', 'POS', 'uni_POS', 'morph_type',
                     'dep_label', 'dep_rel', 'space', 'probbank', 'ner',
                     'token_bigram', 'pos_bigram']]
num_features = train_df[['sent_id', 'token_id', 'distance_head']]
num_features = num_features.astype(int)
# Apply the hashing trick to the categorical features
hasher = FeatureHasher(n_features=12, input_type='string')
hashed_features = hasher.transform(cat_features.values.astype(str))

# Convert the resulting sparse matrix to a dense matrix and concatenate with the numerical features
hashed_features = pd.DataFrame(hashed_features.toarray())
X = pd.concat([hashed_features, num_features], axis=1)

In [7]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,sent_id,token_id,distance_head
0,-2.0,1.0,-1.0,2.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-1.0,0.0,0,1,0
1,0.0,-1.0,0.0,3.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0,2,1
2,-1.0,0.0,-1.0,2.0,0.0,-2.0,-1.0,0.0,0.0,1.0,1.0,1.0,0,3,1
3,0.0,1.0,3.0,4.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0,4,1
4,1.0,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,2.0,1.0,0,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035899,0.0,1.0,1.0,2.0,0.0,-1.0,1.0,0.0,0.0,-1.0,1.0,0.0,7506,131,123
1035900,0.0,0.0,-1.0,4.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,0.0,7506,132,134
1035901,-1.0,1.0,0.0,2.0,-1.0,0.0,-1.0,1.0,0.0,-3.0,1.0,1.0,7506,133,134
1035902,0.0,1.0,0.0,2.0,-2.0,1.0,0.0,0.0,-2.0,0.0,0.0,-2.0,7506,134,4


# First Model

In [8]:
train_df = train_df.assign(label=[0 if target=="_" or target=="V" else 1 for target in train_df['target']])

In [9]:
train_df.label

0          0
1          0
2          0
3          0
4          0
          ..
1035899    0
1035900    0
1035901    1
1035902    0
1035903    1
Name: label, Length: 1035904, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, train_df.label, test_size=0.2, random_state=42)

In [11]:
#instantiate the model
log_regression = LogisticRegression(penalty='l2')

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

f1 = f1_score(y_test,y_pred, average='weighted')
prec = precision_score(y_test,y_pred, average='weighted')
print(f"{f1=}")
print(f"{prec=}")



f1=0.8715245319534531
prec=0.8574267005571962


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Second Model

In [12]:
first_model_output = log_regression.predict(X)



In [13]:
len(first_model_output)

1035904

In [14]:
X['helper_1'] = first_model_output

In [15]:
def make_targets_categorical_label(df):
    label_encoder = LabelEncoder()
    y_train = df['target']
    label_encoder.fit(y_train)
    y_train = label_encoder.transform(y_train)
    print(f'{len(set(y_train))}: Arguement categories')
    return y_train
y = make_targets_categorical_label(train_df)

61: Arguement categories


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
#instantiate the model
log_regression = LogisticRegression(penalty='l2')

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
f1 = f1_score(y_test,y_pred, average='weighted')
prec = precision_score(y_test,y_pred, average='weighted')
print(f"{f1=}")
print(f"{prec=}")

f1=0.8150204836393925
prec=0.7636128367982268


  _warn_prf(average, modifier, msg_start, len(result))
