<a href="https://colab.research.google.com/github/hasan-sh/advanced-nlp/blob/main/Firstmodel_gabhoo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

from sklearn.metrics import *

In [4]:
!git clone https://github.com/hasan-sh/advanced-nlp.git

fatal: destination path 'advanced-nlp' already exists and is not an empty directory.


In [5]:
!pwd

/content


In [6]:
train_file = '/content/advanced-nlp/data/en_ewt-up-train.conllu'
test_file = '/content/advanced-nlp/data/en_ewt-up-test.conllu'

In [7]:
def read_data(file_path, save_to_csv=False):
    """
    This function reads a CoNLL-U format file and converts it into a pandas DataFrame.
    Each row in the DataFrame corresponds to a token in the file, and columns
    correspond to different features of the token, such as the token itself, its lemma, 
    part-of-speech tag, and syntactic dependency information.
    
    Parameters:
    file_path (str): The path to the input CoNLL-U format file.
    save_to_csv (bool): A boolean flag indicating whether to save the resulting DataFrame 
                        to a CSV file. Default is False.
                        
    Returns:
    df (pandas.DataFrame): A pandas DataFrame containing the token-level information from
                           the input file.
    """
    
    # Open and read the input file
    with open(file_path, 'r', encoding='utf-8') as f:
        train_data = f.read()
    
    # Split the file into individual documents, each separated by a blank line
    data = []
    for doc_i, doc in enumerate(train_data.split('\n\n')):
        doc = doc.split('\n')
        sentences = ''
        for line in doc:
            # Skip lines starting with '#' (comment lines)
            if line and line[0] != '#':
                line = line.split('\t')
                line.insert(0, str(doc_i))
                sentences += '\t'.join(line) + '\n'
        data.append(sentences)
    
    # Create a pandas DataFrame from the token-level data
    train_df = pd.DataFrame([x.split('\t') for sent in data for x in sent.split('\n') if x])
    
    # Rename the columns of the DataFrame
    train_df = train_df.rename(columns={
        0:'sent_id', 
        1:'token_id', 
        2:'token', 
        3:'lemma', 
        4:'POS', 
        5:'uni_POS',
        6:'morph_type', 
        7:'distance_head', 
        8:'dep_label', 
        9:'dep_rel', 
        10:'space', 
        11:'probbank'
    })
    
    # Convert the DataFrame from wide to long format
    df = train_df.melt(
        id_vars=[i for i in train_df.columns[:12]], 
        var_name="notneeded", 
        value_name="target"
    )
    
    # Drop the 'notneeded' column and any rows that contain missing values
    #df["sent_id"]=df['sent_id'].str.cat((df['notneeded'].astype(int)-12).astype(str) , sep="_" )
    df["repetion_id"]=df["notneeded"]-12
    df.drop(['notneeded'], axis=1, inplace=True)
    df = df[df['target'].notna()]
    
    # Optionally save the resulting DataFrame to a CSV file
    if save_to_csv:
        df.to_csv('/content/advanced-nlp/data/test.tsv', sep='\t', index=False)
    
    # Return the resulting DataFrame

    return df

In [8]:
train = read_data(train_file,save_to_csv=True)
test = read_data(test_file, save_to_csv=True)

In [21]:
def make_binary_label(df):
  df = df.assign(label=[0 if target=="_" or target=="V" else 1 for target in df['target']])
  df= df.drop('target', axis=1)
  return df

#clean column
def columns_cleaning(df): #political choices inside
  df=df[df["distance_head"]!="_"]#like this
  df["distance_head"]=df["distance_head"].astype(int) #WARNING IS FROM HERE
  
  df["sent_id"]=df["sent_id"].astype(int) #WARNING IS FROM HERE

  df= df.drop('token_id', axis=1) #or this
  df=df[['sent_id', 'repetion_id','token', 'lemma', 'POS', 'uni_POS', 'morph_type',
        'distance_head', 'dep_label', 'dep_rel', 'space', 'probbank' ,'label']]

  return df


def make_NER(df):
  return df

cols_to_encode=[ 'token', 'lemma', 'POS', 'uni_POS',
       'morph_type', 'dep_label', 'dep_rel', 'space',
       'probbank']

def create_encoding(df,cols_to_encode):
  # create a LabelEncoder objec
  le = LabelEncoder()

  # iterate over the columns to encode
  for col in cols_to_encode:
      df[col] = le.fit_transform(df[col])
  return df




##WITH ONLY TOKEN

In [16]:
tokens_train=train[["token","target"]]
tokens_test=test[["token","target"]]

In [17]:
tokens_train=make_binary_label(tokens_train)
tokens_test=make_binary_label(tokens_test)



#cols_to_encode=['token']
#tokens_train=create_encoding(tokens_train,cols_to_encode)
#tokens_test=create_encoding(tokens_test,cols_to_encode)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(tokens_train['token'])
X_test = vectorizer.transform(tokens_test['token'])
y_train=tokens_train['label']
y_test=tokens_test['label']


 MODEL

In [None]:
#instantiate the model
log_regression = LogisticRegression(penalty='l2')

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

f1 = f1_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
print(f"{f1=}")
print(f"{prec=}")

In [None]:
y_pred.sum()

##WITH BASIC FEATURES

In [None]:

##WRAPPER FUNCTION
def process_dataset(df):
  
  df=make_binary_label(df)

  df=columns_cleaning(df)

  df=make_NER(df)

  cols_to_encode=[ 'token', 'lemma', 'POS', 'uni_POS', 'morph_type', 'dep_label', 'dep_rel', 'space', 'probbank']
  df=create_encoding(df,cols_to_encode)

  return df

In [None]:
df_train=process_dataset(train)
df_test=process_dataset(test)


In [None]:
X_train=df_train
y_train=X_train.pop('label')


X_test=df_test
y_test=X_test.pop('label')


THE MODEL


In [None]:
#instantiate the model
log_regression = LogisticRegression(penalty='l2')

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

f1 = f1_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
print(f"{f1=}")
print(f"{prec=}")


In [None]:
y_pred.sum()

SOoo the issue is that keeps crashing if we put countvector or onehotvector along with other feature

##WITH COUNTVECTOR TOKEN




In [10]:
tokens_train=train
tokens_test=test

In [168]:
tokens_train=make_binary_label(tokens_train)
tokens_test=make_binary_label(tokens_test)




t_train=columns_cleaning(tokens_train)

cols_to_encode=[ 'POS', 'uni_POS', 'morph_type', 'dep_label', 'dep_rel', 'space', 'probbank']
t_train=create_encoding(t_train,cols_to_encode)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [169]:

#cols_to_encode=['token']
#tokens_train=create_encoding(tokens_train,cols_to_encode)
#tokens_test=create_encoding(tokens_test,cols_to_encode)
vectorizer = CountVectorizer()
train_vec_token = vectorizer.fit_transform(t_train['token'])
#test_vec_token = vectorizer.transform(tokens_test['token'])


##COLUMN TRANSFORMER

In [22]:
train.columns

Index(['sent_id', 'token_id', 'token', 'lemma', 'POS', 'uni_POS', 'morph_type',
       'distance_head', 'dep_label', 'dep_rel', 'space', 'probbank', 'target',
       'repetion_id'],
      dtype='object')

In [23]:
col=['sent_id', 'token_id', 'POS', 'uni_POS', 'morph_type',
       'distance_head', 'dep_label', 'dep_rel', 'space', 'probbank', 'target',
       'repetion_id']

In [30]:
ttrain=train[col]
ttest=test[col]

In [32]:
def columns_cleaning(df): #political choices inside
  df=df[df["distance_head"]!="_"]#like this
  df["distance_head"]=df["distance_head"].astype(int) #WARNING IS FROM HERE
  
  df["sent_id"]=df["sent_id"].astype(int) #WARNING IS FROM HERE

  df= df.drop('token_id', axis=1) #or this
  df=df[['sent_id', 'repetion_id','POS', 'uni_POS', 'morph_type',
        'distance_head', 'dep_label', 'dep_rel', 'space', 'probbank' ,'label']]

  return df

In [33]:
cols_to_encode=[ 'token', 'lemma', 'POS', 'uni_POS',
       'morph_type', 'dep_label', 'dep_rel', 'space',
       'probbank']

def create_encoding(df,cols_to_encode):
  # create a LabelEncoder objec
  le = LabelEncoder()

  # iterate over the columns to encode
  for col in cols_to_encode:
      df[col] = le.fit_transform(df[col])
  return df


In [34]:
ttrain=make_binary_label(ttrain)
ttest=make_binary_label(ttest)

ttrain=columns_cleaning(ttrain)
ttest=columns_cleaning(ttest)

col=[ 'POS', 'uni_POS',
       'morph_type', 'dep_label', 'dep_rel', 'space',
       'probbank']

ttrain=create_encoding(ttrain,col)
ttest=create_encoding(ttest,col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [35]:
X_train=ttrain
y_train=X_train.pop('label')


X_test=ttest
y_test=X_test.pop('label')


In [36]:
ttrain

Unnamed: 0,sent_id,repetion_id,POS,uni_POS,morph_type,distance_head,dep_label,dep_rel,space,probbank
0,0,0,11,24,74,0,46,0,30,1
1,0,0,12,15,100,1,44,3130,30,1
2,0,0,11,24,74,1,25,2995,31,1
3,0,0,12,6,100,1,44,3130,31,1
4,0,0,0,17,41,6,4,9101,31,1
...,...,...,...,...,...,...,...,...,...,...
7077906,7506,34,15,41,90,123,2,756,31,1238
7077907,7506,34,4,9,100,134,9,1097,31,1
7077908,7506,34,10,29,36,134,35,1098,31,1
7077909,7506,34,15,42,63,4,14,7504,31,1300


THE MODEL


In [37]:
#instantiate the model
log_regression = LogisticRegression(penalty='l2')

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

f1 = f1_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
print(f"{f1=}")
print(f"{prec=}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


f1=0.0
prec=0.0


In [39]:
y_pred.sum()

3

##COLUMNT TRANSFORMER ATTEMPT

In [17]:
ct = ColumnTransformer([("token_vec", OneHotEncoder(), ["token"])], remainder="passthrough",sparse_threshold=0)

In [None]:
dataset = ct.fit_transform(ttrain)

In [43]:
dataset.shape

(103246, 4880)

In [46]:
dataset.shape

(103246, 4881)