# Lingualytics

## Installation

Use this one-line command to install Lingualytics:

In [None]:
!pip install lingualytics

## Preprocessing

In this example, we'll take a code-switched dataset and clean it by removing the digits, punctuation and stopwords.

In [None]:
import pandas as pd
from texthero.preprocessing import remove_digits
from lingualytics.preprocessing import remove_lessthan, remove_punctuation, remove_stopwords
from lingualytics.stopwords import hi_stopwords, en_stopwords

# Do not truncate long text cells when the frame is displayed.
pd.set_option('display.max_colwidth', None)

# Tab-separated file with no header row; column names are supplied on load.
sail_validation_url = "https://github.com/lingualytics/py-lingualytics/raw/master/datasets/SAIL_2017/Processed_Data/Devanagari/validation.txt"
df = pd.read_csv(sail_validation_url, header=None, sep='\t', names=['text', 'label'])

# Merge the `en_` and `hi_` stopword collections into one for the final step.
combined_stopwords = en_stopwords.union(hi_stopwords)

# Apply the cleaning steps in order: digits -> punctuation -> stopwords.
digits_removed = remove_digits(df['text'])
punctuation_removed = remove_punctuation(digits_removed)
df['clean_text'] = remove_stopwords(punctuation_removed, stopwords=combined_stopwords)

# Last expression of the cell: render the frame with the new column.
df

## Classification

This example trains a classifier on a given dataset (here, `SAIL_2017`). It uses **PyTorch** internally to perform the training.

In [None]:
from lingualytics.learner import Learner

# Configure the learner: model family, checkpoint name, dataset name,
# and the training settings (train_bs is presumably the batch size — confirm
# against the Learner documentation).
learner = Learner(
    model_type='bert',
    model_name='bert-base-multilingual-cased',
    dataset='SAIL_2017',
    train_bs=16,
    num_train_epochs=1,
)

# Run training with the configuration above.
learner.fit()

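## Find topmost n-grams

This example cleans the BBC Sport dataset, extracts its 2-grams with `get_ngrams`, and plots the counts of the first 10 as a bar chart.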

In [None]:
# The preprocessing helpers and stopword lists are imported here as well, so
# this example runs on its own instead of relying on names left behind by the
# Preprocessing cell.
from lingualytics.preprocessing import remove_punctuation, remove_stopwords
from lingualytics.representation import get_ngrams
from lingualytics.stopwords import hi_stopwords, en_stopwords
from texthero.preprocessing import remove_digits
import pandas as pd
import plotly.express as px

df = pd.read_csv(
   "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv"
)

# clean text: digits -> punctuation -> stopwords, overwriting the 'text' column
df['text'] = (
    df['text']
    .pipe(remove_digits)
    .pipe(remove_punctuation)
    .pipe(remove_stopwords, stopwords=en_stopwords.union(hi_stopwords))
)

# get n-grams
n = 2        # size of each n-gram
top_k = 10   # number of leading entries kept for the plot
ngrams = get_ngrams(df['text'], n=n, merge=True)
ngrams = ngrams[:top_k]

# plot: one bar per kept n-gram, bar height taken from the 'count' column
tw = pd.DataFrame(ngrams, columns=[f'{n}-gram', 'count'])
fig = px.bar(tw, x=f'{n}-gram', y='count')
fig.show()