In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [25]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2023.6.3-cp39-cp39-macosx_11_0_arm64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.6.3


In [26]:
import nltk

# Download NLTK's 'punkt' tokenizer package into the local nltk_data folder.
# The call is the last expression, so its return value is shown as cell output.
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to /Users/hakanmeva/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
from nltk.tokenize import sent_tokenize

In [83]:
# Years covered by the yearly parquet files: 2013 through 2022 inclusive.
years = list(range(2013, 2023))

In [84]:
# Read one "burse fonduri mutuale" parquet file per year, then stack the
# yearly frames into a single DataFrame (original row indices are kept).
df_burse_list = [
    pd.read_parquet('datasets/df_burse_fonduri_mutuale_' + str(year) + '.parquet')
    for year in years
]
df_burse = pd.concat(df_burse_list)

In [85]:
# Read one "finante pers" parquet file per year, then stack the yearly
# frames into a single DataFrame (original row indices are kept).
df_pers_list = [
    pd.read_parquet('datasets/df_finante_pers_' + str(year) + '.parquet')
    for year in years
]
df_pers = pd.concat(df_pers_list)

## General Stats

In [86]:
# Report the number of rows in each concatenated dataset.
print('Length of Burse fonduri mutuale dataset: {}'.format(len(df_burse)))
print('Length of Finante personale dataset: {}'.format(len(df_pers)))

Length of Burse fonduri mutuale dataset: 13669
Length of Finante personale dataset: 12240


In [87]:
# Total on-disk size of the yearly Burse parquet files, reported in units of
# 1,000,000 bytes.
burse_mem = sum(
    os.path.getsize('datasets/df_burse_fonduri_mutuale_' + str(year) + '.parquet')
    for year in years
)
print('Total file size for Burse fonduri mutuale dataset: {}M'.format(burse_mem / 1000000))

# Same total for the Finante personale files. NOTE(review): despite its name,
# `burse_pers` holds the Finante personale size; the name is kept unchanged.
burse_pers = sum(
    os.path.getsize('datasets/df_finante_pers_' + str(year) + '.parquet')
    for year in years
)
print('Total file size for Finante personale dataset: {}M'.format(burse_pers / 1000000))

Total file size for Burse fonduri mutuale dataset: 17.916267M
Total file size for Finante personale dataset: 18.619099M


### Burse DS Sentence Stats

In [88]:
# For each text column, tokenize every value into sentences and store the
# number of sentences in a new column. Note that the count for the
# `description` column is stored as `summary_sent_len`.
for count_col, text_col in (
    ('title_sent_len', 'title'),
    ('headline_sent_len', 'headline'),
    ('summary_sent_len', 'description'),
    ('article_sent_len', 'article_text'),
):
    df_burse[count_col] = df_burse[text_col].apply(sent_tokenize).str.len()

In [89]:
# Summary statistics of per-title sentence counts, restricted to rows with at least one sentence.
df_burse.loc[df_burse['title_sent_len'] > 0, 'title_sent_len'].describe()

count    13669.000000
mean         1.659668
std          0.908322
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          8.000000
Name: title_sent_len, dtype: float64

In [90]:
# Total number of sentences found across all titles.
df_burse.loc[df_burse['title_sent_len'] > 0, 'title_sent_len'].sum()

22686

In [91]:
# Summary statistics of per-headline sentence counts, restricted to rows with at least one sentence.
df_burse.loc[df_burse['headline_sent_len'] > 0, 'headline_sent_len'].describe()

count    13669.000000
mean         1.430317
std          0.655724
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          5.000000
Name: headline_sent_len, dtype: float64

In [92]:
# Total number of sentences found across all headlines.
df_burse.loc[df_burse['headline_sent_len'] > 0, 'headline_sent_len'].sum()

19551

In [93]:
# Summary statistics of per-description sentence counts, restricted to rows with at least one sentence.
df_burse.loc[df_burse['summary_sent_len'] > 0, 'summary_sent_len'].describe()

count    10674.000000
mean         1.411373
std          0.815999
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         16.000000
Name: summary_sent_len, dtype: float64

In [94]:
# Total number of sentences found across all descriptions.
df_burse.loc[df_burse['summary_sent_len'] > 0, 'summary_sent_len'].sum()

15065

In [95]:
# Summary statistics of per-article sentence counts, restricted to rows with at least one sentence.
df_burse.loc[df_burse['article_sent_len'] > 0, 'article_sent_len'].describe()

count    13251.000000
mean        10.070334
std         10.042974
min          1.000000
25%          5.000000
50%          8.000000
75%         12.000000
max        169.000000
Name: article_sent_len, dtype: float64

In [96]:
# Total number of sentences found across all article bodies.
df_burse.loc[df_burse['article_sent_len'] > 0, 'article_sent_len'].sum()

133442

### Finante Pers DS Sentence Stats

In [97]:
# For each text column, tokenize every value into sentences and store the
# number of sentences in a new column. Note that the count for the
# `description` column is stored as `summary_sent_len`.
for count_col, text_col in (
    ('title_sent_len', 'title'),
    ('headline_sent_len', 'headline'),
    ('summary_sent_len', 'description'),
    ('article_sent_len', 'article_text'),
):
    df_pers[count_col] = df_pers[text_col].apply(sent_tokenize).str.len()

In [98]:
# Summary statistics of per-title sentence counts, restricted to rows with at least one sentence.
df_pers.loc[df_pers['title_sent_len'] > 0, 'title_sent_len'].describe()

count    12240.000000
mean         1.670180
std          0.913334
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          8.000000
Name: title_sent_len, dtype: float64

In [99]:
# Total number of sentences found across all titles.
df_pers.loc[df_pers['title_sent_len'] > 0, 'title_sent_len'].sum()

20443

In [100]:
# Summary statistics of per-headline sentence counts, restricted to rows with at least one sentence.
df_pers.loc[df_pers['headline_sent_len'] > 0, 'headline_sent_len'].describe()

count    12240.000000
mean         1.439542
std          0.622275
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          5.000000
Name: headline_sent_len, dtype: float64

In [101]:
# Total number of sentences found across all headlines.
df_pers.loc[df_pers['headline_sent_len'] > 0, 'headline_sent_len'].sum()

17620

In [102]:
# Summary statistics of per-description sentence counts, restricted to rows with at least one sentence.
df_pers.loc[df_pers['summary_sent_len'] > 0, 'summary_sent_len'].describe()

count    11050.000000
mean         1.369140
std          0.726474
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          8.000000
Name: summary_sent_len, dtype: float64

In [103]:
# Total number of sentences found across all descriptions.
df_pers.loc[df_pers['summary_sent_len'] > 0, 'summary_sent_len'].sum()

15129

In [104]:
# Summary statistics of per-article sentence counts, restricted to rows with at least one sentence.
df_pers.loc[df_pers['article_sent_len'] > 0, 'article_sent_len'].describe()

count    12233.000000
mean        11.390501
std         14.023226
min          1.000000
25%          5.000000
50%          8.000000
75%         13.000000
max        327.000000
Name: article_sent_len, dtype: float64

In [105]:
# Total number of sentences found across all article bodies.
df_pers.loc[df_pers['article_sent_len'] > 0, 'article_sent_len'].sum()

139340

### Burse DS Word Stats

In [None]:
# TODO: include vocab stats,
#       as in the other analysis
# TODO: include the number of words contained in the article
# TODO: plot histograms in parallel

### Finante Pers DS Word Stats

In [None]:
# TODO: include vocab stats,
#       as in the other analysis
# TODO: include the number of words contained in the article
# TODO: plot histograms in parallel

In [None]:
# TODO: calculate the number of words from the title, headline and summary that are found in the article body
# TODO: calculate stats for the references used in the recommender system: how many have recommendations, how many unique recommendations