# Primary Text Processing

## Summary

In [1]:
# Code summary of below

In [2]:
!python --version

Python 3.10.13


## Import

### Libraries

In [3]:
import os 
import codecs

from backend import *

### Definitions

In [4]:
texts = '../texts/fiction/'

libCols = ['author','pub_year','title','text']
tokenOHCO = ['w_id','part_num','para_num', 'sent_num', 'token_num']
tokenCols = ['p_id', 'start', 'stop', 'text', 'token_id', 'head_id', 'rel', 'pos', 'lemma', 'anim', 'aspect', \
             'case', 'degree', 'gender', 'mood', 'number', 'person', 'tense', 'verb_form', 'voice']

## Library

In [6]:
libDf = pd.DataFrame(columns = libCols)
for t in os.listdir(texts): 
    if t[-4:] == '.txt': 
        #print(t)
        info = re.match(r'(\w+)-(\d{4})-(.+).txt', t)
        with codecs.open(texts+t, 'r', encoding='windows-1251') as f: 
            textytext = f.read()
        libDf = libDf.concat({
            'author': info.group(1),
            'pub_year': int(info.group(2)), 
            'title': info.group(3), 
            'text': textytext
        }, ignore_index=True)
        
libDf = libDf.sort_values(libCols[1:3]).reset_index().drop(['index'], axis=1)
libDf.index.name = 'w_id'
libTextsDf = libDf[[libCols[3]]]
libDf = libDf.drop(columns=[libCols[3]])
libDf

AttributeError: 'DataFrame' object has no attribute 'concat'

In [None]:
libTextsDf

### Regularize

In [None]:
confessionDf = textRegularize(libTextsDf, 10)
#confessionDf

In [None]:
dpDf = textRegularize(libTextsDf, 14)
#dpDf

In [None]:
motherTextDf = textRegularize(libTextsDf, 6)
#motherTextDf

In [None]:
# split into chapters
detstvoTextDf = libTextsDf.loc[[22]]
detstvoTextDf = pd.DataFrame(data=detstvoTextDf.text.str.split(r'\*\w+\*\n\n').to_list()[0]).reset_index()
detstvoTextDf = detstvoTextDf.rename(columns={'index':'chap', 0:'text'})[1:]
detstvoTextDf = detstvoTextDf.text.str.split('\n\n', expand=True).stack().to_frame()
#detstvoTextDf

## Words

### Tokens

In [7]:
motherTokenDf = pd.read_pickle('./proc/MotherTokendf.pkl')#.set_index(['p_id','token_id'])

In [8]:
motherTokenDf.set_index(['p_id','token_id'])

Unnamed: 0_level_0,Unnamed: 1_level_0,start,stop,text,head_id,rel,pos,lemma,anim,aspect,case,degree,gender,mood,number,person,tense,verb_form,voice
p_id,token_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1_1,0,6,Каждый,1_2,det,DET,каждый,,,Acc,,Masc,,Sing,,,,
1,1_2,7,11,день,1_27,obl,NOUN,день,Inan,,Acc,,Masc,,Sing,,,,
1,1_3,12,15,над,1_5,case,ADP,над,,,,,,,,,,,
1,1_4,16,23,рабочей,1_5,amod,ADJ,рабочий,,,Ins,Pos,Fem,,Sing,,,,
1,1_5,24,33,слободкой,1_27,obl,NOUN,слободка,Inan,,Ins,,Fem,,Sing,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4436,1_1,0,6,Кто-то,1_2,nsubj,PRON,кто-то,,,Nom,,,,,,,,
4436,1_2,7,14,ответил,1_0,root,VERB,ответить,,Perf,,,Masc,Ind,Sing,,Past,Fin,Act
4436,1_3,15,17,ей,1_2,iobj,PRON,она,,,Dat,,Fem,,Sing,3,,,
4436,1_4,18,25,громким,1_5,amod,ADJ,громкий,,,Dat,Pos,,,Plur,,,,


### Vocab

In [9]:
motherRankDf = GetRankDf(motherTokenDf)
motherRankDf

Unnamed: 0_level_0,count,rank
lemma,Unnamed: 1_level_1,Unnamed: 2_level_1
она,1997,1
он,1813,2
я,1120,3
мать,899,4
весь,807,5
...,...,...
прибавиться,1,11873
имущество,1,11874
еевнимательный,1,11875
пестрядинный,1,11876


## Vector Space

In [14]:
import fasttext

### BOW

In [None]:
motherBOW = motherTokenDf.groupby(['p_id', 'lemma']).lemma.count().to_frame('n')

In [None]:
motherBOW

In [None]:
motherDTCM = motherBOW.n.unstack(fill_value=0)
motherDTCM

### TFIDF

### TTM

In [None]:
TTM = pd.get_dummies(motherTokenDf['lemma'], columns=['lemma'], prefix_sep='', drop_first=False).reset_index(drop=True).iloc[:,1:]
TTM.index.name = 'time_id'
TTM = TTM.astype('int')
TTM

In [None]:
cfg = {'figsize': (20,1)}

In [None]:
TTM['мать'].plot(**cfg);

## Vectorization

## Primary Component Analysis (PCA)

In [None]:
from navec import Navec

path = './models/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [None]:
import torch
from slovnet.model.emb import NavecEmbedding

In [None]:
emb = NavecEmbedding(navec)
input = torch.tensor([1, 2, 0])
output = emb(input)

In [None]:
output.shape

In [None]:
def vectorize_text(text, embeddings):
    # Simple tokenization based on spaces (for demonstration purposes)
    tokens = text.split()
    
    # Initialize an empty tensor for storing embeddings
    vector = torch.zeros(embeddings.dim)
    
    for token in tokens:
        # Add the embeddings of each token; you might want to handle OOV (out-of-vocabulary) tokens
        vector += embeddings.get_vecs_by_tokens(token.lower(), lower_case_backup=True)
    
    # Average the vectors (simple approach)
    return vector / len(tokens)

In [None]:
data = {'text': ['Hello world', 'PyTorch vectorization example', 'Text vectorization with PyTorch']}
df = pd.DataFrame(data)

In [None]:
# Loading GloVe embeddings
glove = GloVe(name='6B', dim=100)  # Example: 100-dimensional GloVe vectors


In [None]:
# Apply vectorization to each row in the DataFrame
df['vector'] = df['text'].apply(lambda x: vectorize_text(x, glove))