<a href="https://colab.research.google.com/github/karolinakuligowska/census-app_bez_polskich_liter/blob/main/KK_TMSMM_class_2_DONE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP - Text Parsing, Stemming, Stopword removal, Term Frequency Matrix

In [1]:
# load packages

import os
import re
import string
from collections import Counter

import nltk
import pandas as pd

In [2]:
# Corpus utilities and the stopword lists.
import nltk.corpus
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords 
# Fetch the stopword corpus so that stopwords.words('english') can be loaded
# in the stopword-removal step.
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Tokenizer models needed by word_tokenize.
# Older NLTK releases load the "punkt" resource, while NLTK 3.8.2 and later
# look for "punkt_tab" instead. Both are requested so that the tokenization
# step works on a fresh kernel regardless of the installed NLTK version; a
# resource unknown to the installed version is expected to be reported by the
# downloader rather than raised.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Sample text defined inline (no file is read here). It contains stray symbols,
# digits, repeated spaces and repeated commas, which the cleaning steps remove.
text="/@@@111Faculty      of Economic Sciences,,,, as an independent unit of the University of Warsaw, affirms its commitment to basic goals and values specified in the Mission Statement of the University of Warsaw. In regard to the way in which the mission of our Alma Mater refers to the discipline represented by the Faculty of Economic Sciences, we define the following goal and value as our priorities of special importance: unity of research and teaching is the foundation of the activities at the Faculty of Economic Sciences."


In [12]:
# Show the raw text before any cleaning is applied.
print(text)

/@@@111Faculty      of Economic Sciences,,,, as an independent unit of the University of Warsaw, affirms its commitment to basic goals and values specified in the Mission Statement of the University of Warsaw. In regard to the way in which the mission of our Alma Mater refers to the discipline represented by the Faculty of Economic Sciences, we define the following goal and value as our priorities of special importance: unity of research and teaching is the foundation of the activities at the Faculty of Economic Sciences.


# Text parsing 
--------------------------


# Preliminary cleaning

In [13]:
# Strip special characters such as "/", "@" and ",": every character that is
# not an ASCII letter, a digit, a space, a newline or a full stop is deleted.
# Note that the characters are removed outright, not replaced by a space.
# The pattern is a raw string so the regex reaches the `re` module unchanged;
# inside a character class the dot needs no escaping.
text_clean = re.sub(r'[^a-zA-Z0-9 \n.]', '', text)
print(text_clean)

111Faculty      of Economic Sciences as an independent unit of the University of Warsaw affirms its commitment to basic goals and values specified in the Mission Statement of the University of Warsaw. In regard to the way in which the mission of our Alma Mater refers to the discipline represented by the Faculty of Economic Sciences we define the following goal and value as our priorities of special importance unity of research and teaching is the foundation of the activities at the Faculty of Economic Sciences.


# Cleaning text


In [14]:
# a) Remove unnecessary spaces, punctuation and numbers.

# Inner call: squeeze every run of spaces down to a single space.
# Outer call: delete whatever is neither a word character nor whitespace,
# i.e. the full stops that the preliminary cleaning kept. Most punctuation was
# already removed there; the punctuation set could also be listed manually.
text_cleaner = re.sub(
    r'[^\w\s]', '',
    re.sub(' +', ' ', text_clean),
)

In [15]:
# Remove digits. The pattern is a raw string so that \d is passed to the regex
# engine as written instead of being treated as a (invalid) Python string escape.
text_cleaner = re.sub(r'\d', '', text_cleaner)

# Deleting characters can leave doubled spaces behind, so collapse them again.
text_cleaner = re.sub(' +', ' ', text_cleaner)
print(text_cleaner)

Faculty of Economic Sciences as an independent unit of the University of Warsaw affirms its commitment to basic goals and values specified in the Mission Statement of the University of Warsaw In regard to the way in which the mission of our Alma Mater refers to the discipline represented by the Faculty of Economic Sciences we define the following goal and value as our priorities of special importance unity of research and teaching is the foundation of the activities at the Faculty of Economic Sciences


In [16]:
# b) Change letters to lower case.
text_cleaner = text_cleaner.lower()

# The string is already lower-cased at this point, so it is printed as is.
print(text_cleaner)

faculty of economic sciences as an independent unit of the university of warsaw affirms its commitment to basic goals and values specified in the mission statement of the university of warsaw in regard to the way in which the mission of our alma mater refers to the discipline represented by the faculty of economic sciences we define the following goal and value as our priorities of special importance unity of research and teaching is the foundation of the activities at the faculty of economic sciences


# Tokenization

In [22]:
# Split the cleaned, lower-cased string into a list of word tokens.
word_tokens = word_tokenize(text_cleaner)

print(word_tokens)

['faculty', 'of', 'economic', 'sciences', 'as', 'an', 'independent', 'unit', 'of', 'the', 'university', 'of', 'warsaw', 'affirms', 'its', 'commitment', 'to', 'basic', 'goals', 'and', 'values', 'specified', 'in', 'the', 'mission', 'statement', 'of', 'the', 'university', 'of', 'warsaw', 'in', 'regard', 'to', 'the', 'way', 'in', 'which', 'the', 'mission', 'of', 'our', 'alma', 'mater', 'refers', 'to', 'the', 'discipline', 'represented', 'by', 'the', 'faculty', 'of', 'economic', 'sciences', 'we', 'define', 'the', 'following', 'goal', 'and', 'value', 'as', 'our', 'priorities', 'of', 'special', 'importance', 'unity', 'of', 'research', 'and', 'teaching', 'is', 'the', 'foundation', 'of', 'the', 'activities', 'at', 'the', 'faculty', 'of', 'economic', 'sciences']


# Stopword removal 

In [23]:
# Remove English stopwords.
# The stopword list is wrapped in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep, in their original order, only the tokens that are not stopwords.
filtered_text = [w for w in word_tokens if w not in stop_words]

In [24]:
# Tokens that remain after removing the English stopwords.
print(filtered_text) 

['faculty', 'economic', 'sciences', 'independent', 'unit', 'university', 'warsaw', 'affirms', 'commitment', 'basic', 'goals', 'values', 'specified', 'mission', 'statement', 'university', 'warsaw', 'regard', 'way', 'mission', 'alma', 'mater', 'refers', 'discipline', 'represented', 'faculty', 'economic', 'sciences', 'define', 'following', 'goal', 'value', 'priorities', 'special', 'importance', 'unity', 'research', 'teaching', 'foundation', 'activities', 'faculty', 'economic', 'sciences']


In [31]:
# If necessary: remove your own stopwords, given as a list of words.
stop_words_lst = ['a', 'sciences', 'unit', 'affirms', 'basic']

# Keep, in their original order, only the tokens that are not on the custom list.
filtered_text2 = [w for w in filtered_text if w not in stop_words_lst]


In [32]:
# Tokens that remain after also removing the custom stopwords.
print(filtered_text2)

['faculty', 'economic', 'independent', 'university', 'warsaw', 'commitment', 'goals', 'values', 'specified', 'mission', 'statement', 'university', 'warsaw', 'regard', 'way', 'mission', 'alma', 'mater', 'refers', 'discipline', 'represented', 'faculty', 'economic', 'define', 'following', 'goal', 'value', 'priorities', 'special', 'importance', 'unity', 'research', 'teaching', 'foundation', 'activities', 'faculty', 'economic']


# Stemming 

In [34]:
# Stemming cuts word endings so that related forms share one stem.
# For example, "goals" becomes "goal" and "values" becomes "valu"; a stem
# does not have to be a dictionary word ("faculty" becomes "faculti").

# Run the Porter stemmer over every remaining token and show each
# token next to its stem. The stems are only displayed, not stored.
ps = PorterStemmer()

for token in filtered_text2:
    stem = ps.stem(token)
    print(token, " : ", stem)

faculty  :  faculti
economic  :  econom
independent  :  independ
university  :  univers
warsaw  :  warsaw
commitment  :  commit
goals  :  goal
values  :  valu
specified  :  specifi
mission  :  mission
statement  :  statement
university  :  univers
warsaw  :  warsaw
regard  :  regard
way  :  way
mission  :  mission
alma  :  alma
mater  :  mater
refers  :  refer
discipline  :  disciplin
represented  :  repres
faculty  :  faculti
economic  :  econom
define  :  defin
following  :  follow
goal  :  goal
value  :  valu
priorities  :  prioriti
special  :  special
importance  :  import
unity  :  uniti
research  :  research
teaching  :  teach
foundation  :  foundat
activities  :  activ
faculty  :  faculti
economic  :  econom


# Term Frequency Matrix

In [42]:
# Term frequency matrix

wordlist = filtered_text2

# Count each distinct word in a single pass, then look every token up.
# wordfreq stays aligned with wordlist: position i holds the number of times
# wordlist[i] occurs in the whole list, so repeated words repeat their count.
word_counts = Counter(wordlist)
wordfreq = [word_counts[w] for w in wordlist]

print("Original string\n" + text_cleaner +"\n")
print("List\n" + str(wordlist) + "\n")
print("Frequencies\n" + str(wordfreq) + "\n")
# Each token paired with its frequency, in token order.
print("Pairs\n" + str(list(zip(wordlist, wordfreq))))

String
faculty of economic sciences as an independent unit of the university of warsaw affirms its commitment to basic goals and values specified in the mission statement of the university of warsaw in regard to the way in which the mission of our alma mater refers to the discipline represented by the faculty of economic sciences we define the following goal and value as our priorities of special importance unity of research and teaching is the foundation of the activities at the faculty of economic sciences

List
['faculty', 'economic', 'independent', 'university', 'warsaw', 'commitment', 'goals', 'values', 'specified', 'mission', 'statement', 'university', 'warsaw', 'regard', 'way', 'mission', 'alma', 'mater', 'refers', 'discipline', 'represented', 'faculty', 'economic', 'define', 'following', 'goal', 'value', 'priorities', 'special', 'importance', 'unity', 'research', 'teaching', 'foundation', 'activities', 'faculty', 'economic']

Frequencies
[3, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 