# Exploratory Data Analysis

### Content:
- [Required packages & importing data](#import-and-clean)
- [Cleaning data](#cleaning-data)
- [Average Document Length (Dirty text)](#dirty-text)
- [Average Document Length (clean text)](#clean-text)
- [Lemmatizing Documents](#lemm)
- [Ratio between dirty and clean texts](#ratio)
- [Lexical diversity](#lex-div)


### Importing the required packages <a class="anchor" id="import-and-clean"></a>

In [58]:
# !pip install seaborn
# !pip install plotly
# !pip install matplotlib
# !pip install textblob
# !pip install nltk
# !pip install gensim
# !pip install spacy
!pip install nbformat




You should consider upgrading via the 'C:\Users\iliya\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [2]:
# basics
import os
import pandas as pd
import numpy as np

# for plots
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# for text processing
from textblob import TextBlob
import re
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
import spacy

### Import and clean data 
We import the data from a folder (`../Final/documents`) that contains the individual documents as .txt files.

In [3]:
%%time 
# ~Wall time: 12 s
# Directory that holds one UTF-8 encoded .txt file per document
p = '../Final/documents/'

# Read every file and collect the contents first; the DataFrame is built once
# at the end. Growing a frame row by row is quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0.
records = []
# sorted() makes the row order reproducible: os.listdir returns entries in
# arbitrary order.
for f in sorted(os.listdir(p)):
    path = os.path.join(p, f)
    # the context manager closes the file handle as soon as it has been read
    with open(path, 'r', encoding='utf-8') as fh:
        records.append({'text': fh.read()})

# One row per document, raw file content in the 'text' column
txt_data = pd.DataFrame(records, columns=['text'])



CPU times: total: 109 ms
Wall time: 119 ms




In [4]:
# Report the corpus size, then preview the first rows of the raw text
print(f"Number of documents: {len(txt_data)}")
txt_data.head()

Number of documents: 121


Unnamed: 0,text
0,"\n<doc id=""564261"" url=""https://bg.wikipedia.o..."
1,"\n<doc id=""63292"" url=""https://bg.wikipedia.or..."
2,"\n<doc id=""125558"" url=""https://bg.wikipedia.o..."
3,"\n<doc id=""206545"" url=""https://bg.wikipedia.o..."
4,"\n<doc id=""104247"" url=""https://bg.wikipedia.o..."


### Cleaning data  <a class="anchor" id="cleaning-data"></a>
Next, we create a few simple functions that help us clean the data.
- Box-plot and histogram functions that show the number of words per document
- A cleaning function that performs the basic cleaning steps
- A stop-word removal and lemmatization function

In [5]:
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be
# called further down in the notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iliya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# Download the large English spaCy model only when it is not installed yet.
# Going through spaCy's own API (instead of `!python -m spacy ...`) installs
# the model for the interpreter this kernel runs on, and the guard keeps a
# "Restart & Run All" from downloading the model again on every run.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")


^C


In [37]:
%%time 
# ~Wall time: 1,85s
#Box plot function
def plotly_box_plot(data, df_col,
                    ytext='Document length',
                    titletext='Document length distribution within the corpus'):
    """
    Draw and display a plotly.express box plot of one DataFrame column.

    data:      Pandas DataFrame holding the values to plot
    df_col:    Name of the numerical column shown on the y-axis
    ytext:     String used as y-axis title
    titletext: String used as plot title

    The figure is shown directly; nothing is returned.
    """
    box_fig = px.box(
        data,
        y=df_col,
        points="all",
        color_discrete_sequence=['#115dcd'],
    )

    # Title and background colours in a single layout update
    box_fig.update_layout(
        title=titletext,
        paper_bgcolor='white',
        plot_bgcolor='#F7F7F7',
    )

    # y-axis: label plus grid lines
    box_fig.update_yaxes(
        title_text=ytext,
        showgrid=True,
        gridwidth=1,
        gridcolor='#8D8A89',
    )

    box_fig.show()
    
# histogram function
def histogram_plot(data, df_col,
                    ytext='Document length',
                    titletext='Document length distribution within the corpus',
                    xtext='Number of tokens'):
    """
    Draw and display a plotly.express histogram of one DataFrame column.

    data:      Pandas DataFrame holding the values to plot
    df_col:    Name of the column whose values are binned along the x-axis
    ytext:     String used as y-axis title
    titletext: String used as plot title
    xtext:     String used as x-axis title

    The figure is shown directly; nothing is returned.
    """
    # Grid appearance shared by both axes
    grid_style = dict(showgrid=True, gridwidth=1, gridcolor='#8D8A89')

    hist_fig = px.histogram(
        data,
        x=df_col,
        opacity=0.8,
        color_discrete_sequence=['#115dcd'],
    )

    # Title and background colours in a single layout update
    hist_fig.update_layout(
        title=titletext,
        paper_bgcolor='white',
        plot_bgcolor='#F7F7F7',
    )

    # Axis labels together with the grid styling
    hist_fig.update_yaxes(title_text=ytext, **grid_style)
    hist_fig.update_xaxes(title_text=xtext, **grid_style)

    hist_fig.show()
    
# data cleaning function    
def clean_text(data, df_col):
    """
    Add a 'clean_data' column holding a lower-cased, punctuation-free token
    list for every document.

    data:      Pandas DataFrame (modified in place and also returned)
    df_col:    Name of the column holding the raw text strings

    Per document: every run of punctuation characters becomes a single space,
    the text is lower-cased and then split on single spaces. Blank tokens are
    dropped, except a token that is exactly a newline, which is kept.
    """
    punct_chars = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
    punct_run = re.compile('[' + punct_chars + ']+')

    def to_tokens(text):
        lowered = punct_run.sub(' ', text).lower()
        return [tok for tok in lowered.split(' ') if tok == '\n' or tok.strip() != '']

    data['clean_data'] = data[df_col].map(to_tokens)
    return data


# Stop-word list and spaCy pipeline used by the lemmatizer function below.
# NOTE(review): both resources are English, while the sample rows displayed
# earlier in this notebook come from bg.wikipedia.org — confirm that English
# stop words and an English model are what is intended for this corpus.
stopword_list = stopwords.words('english')
nlp = spacy.load("en_core_web_lg")

def lemmatizer(texts):
    """
    Lemmatize a sequence of strings and drop stop-word lemmas.

    texts:   iterable of strings; newlines are removed and surrounding
             whitespace is stripped from each string before processing

    Returns a list with one entry per input string. Each entry is the list of
    lemmas the spaCy pipeline `nlp` produced for that string, without the
    lemmas that occur in `stopword_list`.
    """
    stripped = [text.replace("\n", "").strip() for text in texts]

    # Set membership is O(1) instead of a linear scan of the list for every
    # token. The set is rebuilt on each call so it always reflects the current
    # content of `stopword_list`.
    stop_lemmas = set(stopword_list)

    # Every input string is handed to spaCy as its own document.
    return [
        [token.lemma_ for token in doc if token.lemma_ not in stop_lemmas]
        for doc in nlp.pipe(stripped)
    ]


# word count functions
# sentence count function
# paragraph function


CPU times: total: 3.25 s
Wall time: 3.26 s


In [78]:
!pip install --upgrade plotly




You should consider upgrading via the 'C:\Users\iliya\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


### Average Document Length (Dirty text)<a class="anchor" id="dirty-text"></a>

In [36]:
%%time 
# ~Wall time: 1 sec
# Tokenise the raw (uncleaned) text on single spaces. Blank tokens are
# dropped, except a token that is exactly a newline, which is kept.
txt_data['dirty_data'] = txt_data['text'].map(
    lambda text: [w for w in text.split(' ') if w.strip() != '' or w == '\n']
)
txt_data['dirty_length'] = txt_data['dirty_data'].apply(len)

# Average document length
print('Average document length: {}'.format(round(txt_data['dirty_length'].mean(), 2)))

# Box plot of the document length. The title names the text variant so the
# figure cannot be confused with the clean-text plots.
plotly_box_plot(txt_data, 'dirty_length',
                ytext='Document length',
                titletext='Document length distribution within the corpus (dirty text)')

# Histogram of the document length: the x-axis carries the token count, so
# the y-axis is the number of documents per bin, not a document length.
histogram_plot(txt_data, 'dirty_length',
               ytext='Number of documents',
               titletext='Document length distribution within the corpus (dirty text)')


Average document length: 948.33


CPU times: total: 344 ms
Wall time: 320 ms


### Average Document Length (clean text)<a class="anchor" id="clean-text"></a>

In [38]:
%%time 
# 1,4 sec
# Cleaning data: clean_text adds the 'clean_data' column to the frame it is
# given and returns that same frame, so the result is bound explicitly.
txt_data = clean_text(txt_data, 'text')

txt_data['clean_length'] = txt_data['clean_data'].apply(len)

# Average document length
print('Average document length: {}'.format(round(txt_data['clean_length'].mean(), 2)))

# Box plot of the document length. The title names the text variant so the
# figure cannot be confused with the dirty-text plots.
plotly_box_plot(txt_data, 'clean_length',
                ytext='Document length',
                titletext='Document length distribution within the corpus (clean text)')

# Histogram of the document length: the x-axis carries the token count, so
# the y-axis is the number of documents per bin, not a document length.
histogram_plot(txt_data, 'clean_length',
               ytext='Number of documents',
               titletext='Document length distribution within the corpus (clean text)')


Average document length: 998.31


CPU times: total: 250 ms
Wall time: 258 ms


### Lemmatizing Documents<a class="anchor" id="lemm"></a>

In [14]:
%%time 
# ~ Wall time: 2 min
# Lemmatize the cleaned documents and remove stop words: the token list of
# each row is passed to `lemmatizer` in one call.
txt_data['clean_data_lem'] = txt_data['clean_data'].map(lemmatizer)


CPU times: total: 45 s
Wall time: 45 s


### Calculating the average word count ratio between dirty and clean texts<a class="anchor" id="ratio"></a>

In [18]:
# Per-document ratio of the dirty token count to the clean token count
txt_data['dc_ratio'] = txt_data['dirty_length'] / txt_data['clean_length']

# The value is dirty / clean, so the printed label has to say so as well
print('Dirty/clean doc ratio:{}'.format(txt_data['dc_ratio'].mean()))

Clean/dirty doc ratio:0.93535759713286


### Lexical diversity<a class="anchor" id="lex-div"></a>

In [19]:
# Calculating lexical diversity
from lexical_diversity import lex_div as ld

# adding all the words together is a list
def word_bag(data, column):
    """
    Concatenate the word lists of every row into one flat list.

    data:       pandas DataFrame
    column:     string with the name of the column holding the word lists
    """
    return [word for row_words in data[column] for word in row_words]

# Calculating the standard lexical diversity
def standard_lex(word_list):
    """
    Return the standard lexical diversity of a list of words: the number of
    distinct words divided by the total number of words.

    word_list:  list of (hashable) words

    An empty list has no words to compare, so 0.0 is returned instead of
    raising ZeroDivisionError. This keeps `.apply(standard_lex)` from failing
    on a document that ends up without any tokens.
    """
    total_words = len(word_list)
    if total_words == 0:
        return 0.0

    unique_words = len(set(word_list))
    return unique_words / total_words


ModuleNotFoundError: No module named 'lexical_diversity'

In [17]:
# Corpus-level lexical diversity of the clean data: all documents are pooled
# into one word list first
clean_text_list = word_bag(txt_data, 'clean_data')
print(f'Clean Standard lexical diversity: {standard_lex(clean_text_list)}')

# Same measure on the pooled dirty data
dirty_text_list = word_bag(txt_data, 'dirty_data')
print(f'Dirty Standard lexical diversity: {standard_lex(dirty_text_list)}')

#clean up
# del clean_text_list, dirty_text_list

Clean Standard lexical diversity: 0.10127066769094625
Dirty Standard lexical diversity: 0.10599087243136213


In [11]:
# Lexical diversity per document, for the clean and the dirty token lists
txt_data['clean_lex'] = txt_data['clean_data'].map(standard_lex)
txt_data['dirty_lex'] = txt_data['dirty_data'].map(standard_lex)

# Mean of the per-document values
print(f"Dirty Standard lexical diversity: {txt_data['dirty_lex'].mean()}")
print(f"Clean Standard lexical diversity: {txt_data['clean_lex'].mean()}")

Dirty Standard lexical diversity: 0.44832733437684624
Clean Standard lexical diversity: 0.47505730871582647


### Plotting the lexical diversity

In [12]:
# Box plot of the per-document lexical diversity of the dirty text
plotly_box_plot(txt_data, 'dirty_lex',
                ytext='Dirty Standard lexical diversity',
                titletext='Dirty Standard lexical diversity')

# Histogram of the same values. Labels are passed explicitly because the
# defaults of histogram_plot describe document lengths and token counts,
# which would mislabel a lexical-diversity figure.
histogram_plot(txt_data, 'dirty_lex',
               ytext='Number of documents',
               titletext='Dirty Standard lexical diversity',
               xtext='Dirty Standard lexical diversity')
