**Prototype #2**
>This notebook uses text analysus to explore bilingual lexicon borrowing in a corpus composed by 5 volumes of the Estado Magazine between 1942 and 1982. It takes one volume --composed by several magazine numbers-- for each decade. The goal of this notebook is to create a Document-Term-Matrix with 5 rows and 29 columns. The columns are going to be words in the list of anglicism that have been manually selected to explore the corpus. 
>This notebook will not include an attempt to improve the preprocessing in order to separate words smudged together

**Import Libraries**

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



In [2]:
import PyPDF2
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hernanadasme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hernanadasme/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from collections import Counter
from pathlib import Path

In [48]:
pd.set_option('display.max_columns', None)

**Import pdf file from directory**

>This line of code takes the pdf file from the directory and opens it using the pdf_reader. 

In [3]:
pdf_1943 = open(r'/Users/hernanadasme/Projects/estadio_1940_1980/estadio_pdfs/estadio_43_59_7_05_17_12_1943.pdf', 'rb')
pdf_1952 = open(r'/Users/hernanadasme/Projects/estadio_1940_1980/estadio_pdfs/estadio_468_485_03_05_30_08_1952.pdf', 'rb')
pdf_1962 = open(r'/Users/hernanadasme/Projects/estadio_1940_1980/estadio_pdfs/estadio_986_1001_19_04_02_08_1962.pdf', 'rb')
pdf_1972 = open(r'/Users/hernanadasme/Projects/estadio_1940_1980/estadio_pdfs/estadio_1497_1509_06_04_27_06_1972.pdf', 'rb')
pdf_1982 = open(r'/Users/hernanadasme/Projects/estadio_1940_1980/estadio_pdfs/estadio_2004_2016_05_01_30_03_1982.pdf', 'rb')

In [14]:
pdf_corpus = [pdf_r1943, pdf_r1952, pdf_r1962, pdf_r1972, pdf_r1982]

In [16]:
for pdf in pdf_corpus:
    print(len(pdf.pages))

565
648
780
670
1032


In [8]:
pdf_r1943 = PyPDF2.PdfReader(pdf_1943)
pdf_r1952 = PyPDF2.PdfReader(pdf_1952)
pdf_r1962 = PyPDF2.PdfReader(pdf_1962)
pdf_r1972 = PyPDF2.PdfReader(pdf_1972)
pdf_r1982 = PyPDF2.PdfReader(pdf_1982)

In [9]:
estadio_1943 = ""
estadio_1952 = ""
estadio_1962 = ""
estadio_1972 = ""
estadio_1982 = ""

for page in pdf_r1943.pages:
    estadio_1943 += page.extract_text()  

for page in pdf_r1952.pages:
    estadio_1952 += page.extract_text() 
    
for page in pdf_r1962.pages:
    estadio_1962 += page.extract_text()
    
for page in pdf_r1972.pages:
    estadio_1972 += page.extract_text()

for page in pdf_r1982.pages:
    estadio_1982 += page.extract_text()

In [10]:
estadio_corpus = [estadio_1943, estadio_1952, estadio_1962, estadio_1972, estadio_1982]

**Preprocessing Functions**

In [18]:
def preprocess_text(text):
    # Lowercase the text
    text = text.replace('\n', '')
    pattern_1 = r'[^a-zA-Z0-9\s]'
    # Replace the special characters with an empty string
    text = re.sub(pattern_1, '', text)
    pattern_2 = r'\d+'
    # Replace the digits with an empty string
    text = re.sub(pattern_2, '', text)
    text = text.lower()
    
    return text

In [19]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 0]
    #tokens = [get_lemma(token) for token in tokens]
    return tokens

In [27]:
preprocessed_1943 = preprocess_text(estadio_1943)
preprocessed_1952 = preprocess_text(estadio_1952)
preprocessed_1962 = preprocess_text(estadio_1962)
preprocessed_1972 = preprocess_text(estadio_1972)
preprocessed_1982 = preprocess_text(estadio_1982)

In [28]:
clean_1943 = tokenize(estadio_1943)
clean_1952 = tokenize(estadio_1952)
clean_1962 = tokenize(estadio_1962)
clean_1972 = tokenize(estadio_1972)
clean_1982 = tokenize(estadio_1982)

In [24]:
#creating the list of english terms 
tokens_eng = ['goal','match','Forward','Field','Back','Pitchers','Wing','Shot','shoot','Player','Handicap',
              'Kick','Second','Referee','Insider','Crack','Standard',
              'Jersey','Foul','Knockout','Out','Record','Score','Single','Sport',
              'Shortstop','Training','Centroforward','sprinter']
#lowercasing the terms
tokens_englow = []
for w in tokens_eng:
    word = w.lower()
    tokens_englow.append(word)

In [25]:
#creating the list of spanish terms 
tokens_esplow = ['gol', 'partido','delantero','cancha','defensa','lanzador','lateral','disparo',
                 'disparar','jugador','desventaja','patear','segundo','juez','interior','estrella',
                 'estandar','camiseta','falta','nocaut','fuera','registro','marcador','individual',
                 'deporte','parada','entrenamiento','centrodelantero','velocista']

In [30]:
#counting ESP words in the cleaned txt and creating a dictionary with words and counts
counts_spanish_1943 = {}
counts_spanish_1952 = {}
counts_spanish_1962 = {}
counts_spanish_1972 = {}
counts_spanish_1982 = {}

for token in tokens_esplow:
    count = clean_1943.count(token)
    counts_spanish_1943[token] = count
for token in tokens_esplow:
    count = clean_1952.count(token)
    counts_spanish_1952[token] = count
for token in tokens_esplow:
    count = clean_1962.count(token)
    counts_spanish_1962[token] = count
for token in tokens_esplow:
    count = clean_1972.count(token)
    counts_spanish_1972[token] = count
for token in tokens_esplow:
    count = clean_1982.count(token)
    counts_spanish_1982[token] = count

In [36]:
counts_english_1943 = {}
counts_english_1952 = {}
counts_english_1962 = {}
counts_english_1972 = {}
counts_english_1982 = {}

for token in tokens_englow:
    count = clean_1943.count(token)
    counts_english_1943[token] = count
for token in tokens_englow:
    count = clean_1952.count(token)
    counts_english_1952[token] = count
for token in tokens_englow:
    count = clean_1962.count(token)
    counts_english_1962[token] = count
for token in tokens_englow:
    count = clean_1972.count(token)
    counts_english_1972[token] = count
for token in tokens_englow:
    count = clean_1982.count(token)
    counts_english_1982[token] = count

In [51]:
df_1943_eng = pd.DataFrame(counts_english_1943, index=[1943])
df_1952_eng = pd.DataFrame(counts_english_1952, index=[1952])
df_1962_eng = pd.DataFrame(counts_english_1962, index=[1962])
df_1972_eng = pd.DataFrame(counts_english_1972, index=[1972])
df_1982_eng = pd.DataFrame(counts_english_1982, index=[1982])

In [57]:
df_1943_esp = pd.DataFrame(counts_spanish_1943, index=[1943])
df_1952_esp = pd.DataFrame(counts_spanish_1952, index=[1952])
df_1962_esp = pd.DataFrame(counts_spanish_1962, index=[1962])
df_1972_esp = pd.DataFrame(counts_spanish_1972, index=[1972])
df_1982_esp = pd.DataFrame(counts_spanish_1982, index=[1982])

In [65]:
frames_eng = [df_1943_eng, df_1952_eng, df_1962_eng, df_1972_eng, df_1982_eng]

df_1943_1982_eng = pd.concat(frames_eng)

In [66]:
frames_esp = [df_1943_esp, df_1952_esp, df_1962_esp, df_1972_esp, df_1982_esp]

df_1943_1982_esp = pd.concat(frames_esp)

In [67]:
df_1943_1982_eng

Unnamed: 0,goal,match,forward,field,back,pitchers,wing,shot,shoot,player,handicap,kick,second,referee,insider,crack,standard,jersey,foul,knockout,out,record,score,single,sport,shortstop,training,centroforward,sprinter
1943,1,31,1,2,2,0,0,0,0,2,2,0,0,3,5,12,3,0,1,1,3,12,4,0,0,0,0,1,5
1952,5,20,7,1,2,0,0,0,0,0,1,0,0,3,6,2,1,0,0,0,8,6,1,0,1,0,0,1,1
1962,0,37,5,1,5,0,1,0,0,0,0,0,0,1,3,1,1,0,0,0,2,4,3,0,0,0,0,1,0
1972,0,7,0,0,0,0,2,0,0,0,0,0,0,1,0,1,1,0,2,0,2,15,4,0,0,0,0,0,0
1982,0,18,0,1,1,0,7,0,0,0,0,0,0,0,0,16,0,0,5,0,0,20,1,1,0,0,0,0,0


In [68]:
df_1943_1982_esp

Unnamed: 0,gol,partido,delantero,cancha,defensa,lanzador,lateral,disparo,disparar,jugador,desventaja,patear,segundo,juez,interior,estrella,estandar,camiseta,falta,nocaut,fuera,registro,marcador,individual,deporte,parada,entrenamiento,centrodelantero,velocista
1943,15,43,18,46,43,1,0,0,0,36,2,1,33,0,8,1,0,3,2,0,8,0,5,7,68,1,6,5,3
1952,10,14,7,19,37,1,0,1,0,11,2,0,42,1,4,0,0,2,4,0,5,1,5,33,39,1,1,8,3
1962,16,45,16,21,44,2,3,8,0,26,2,1,26,0,15,3,0,3,1,0,7,1,4,15,22,0,6,18,1
1972,9,50,3,26,13,1,2,3,0,14,1,0,40,0,3,3,0,6,3,0,10,0,10,23,33,3,10,3,2
1982,33,114,29,60,35,2,20,15,1,152,1,1,44,5,4,16,0,8,54,13,97,3,21,23,50,2,6,6,0


**Do the same for spanish**