Libraries used:
* bs4 for using beautiful soup in order to parse files.
* re for regular expressions
* os to join file path
* nltk.tokenize for tokenizing raw text
* collections to count word frequency


## 1. Introduction

This project comprises 2 tasks; this Jupyter notebook contains task 2. There are 139 raw text files. Our task here is to generate a sparse representation of the raw text, create a vocabulary for the sparse files, and create segment boundaries for the sparse text.

## 2.  Import libraries

In [1]:
#importing libraries
from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from collections import Counter

## 3.  Defining file path 

In [2]:
# Locations of the input and output files, relative to the notebook.
stopWords = "./stopwords_en.txt"   # input: stop-word list, one word per line
theText = "./txt_files/"           # input: directory holding the raw text files
segFile = "./topic_segs.txt"       # output: topic-segment boundary vectors
vocab = "./vocab.txt"              # output: vocabulary with word indices

## 4.  Functions

### 4.1  Function to create a set of stop words

In [3]:
# Build the set of stop words from the stop-word file (one word per line).
# The context manager closes the file as soon as it has been read, even if
# reading fails part-way through.
with open(stopWords, 'r') as stop_file:
    # Strip the trailing newline (and any surrounding whitespace) from each line.
    sW = [line.strip() for line in stop_file]

# A set gives constant-time membership tests when filtering tokens later on.
stopwords_set = set(sW)

### 4.2  Functions to create segment boundaries

In [4]:
#function to create topic_segs vectors
def topicSegs1(file,fileType,stopwordSet):
    """Tokenize a text file line by line, keeping the topic-boundary markers.

    A line that is exactly ``**********`` followed by a newline is kept as the
    marker ``['**********']``. Every other line is lower-cased, tokenized, and
    filtered: tokens found in ``stopwordSet`` and tokens of two characters or
    fewer are discarded. Lines left with no tokens are dropped entirely. A
    closing marker is always appended after the last line.

    Args:
        file: Name of the text file inside ``fileType``.
        fileType: Directory that contains ``file``.
        stopwordSet: Collection of words to discard.

    Returns:
        A list of non-empty lists: either a list of kept tokens for one line
        or the marker ``['**********']``.
    """
    stars=['**********']
    #joining paths
    file_text = os.path.join(fileType, file)

    # Build the tokenizer once instead of once per line.
    tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)?")

    segments=[]
    # The context manager guarantees the file is closed when reading is done.
    with open(file_text,'r') as f:
        for line in f:
            if line=='**********\n':
                # Same value the original produced via line.split().
                segments.append(list(stars))
                continue

            kept = [
                w for w in tokenizer.tokenize(line.lower())
                if w not in stopwordSet and len(w)>2
            ]
            # Lines with nothing left after filtering do not form a segment.
            if kept:
                segments.append(kept)

    # Always close the last topic, whether or not the file ends with a marker.
    segments.append(stars)
    return segments

#Turn the segment list from topicSegs1 into a "name:0,0,1,..." boundary string
def topicSegs2(file,fileType,stopwords):
    """Build the boundary-vector line for one text file.

    Every token list returned by ``topicSegs1`` contributes a ``0``; whenever
    a ``['**********']`` marker follows, the most recent flag is switched to
    ``1``. A marker that appears before any token list is ignored.

    Returns:
        The file name without its last four characters, a colon, and the
        comma-separated flags.
    """
    stem = file[:-4]
    marker = ['**********']
    flags = []
    for segment in topicSegs1(file,fileType,stopwords):
        if segment == marker:
            # The preceding line (if any) is the last one of its topic.
            if flags:
                flags[-1] = '1'
        else:
            flags.append('0')
    return stem + ':' + ",".join(flags)

### 4.3  Functions that help create the vocab

In [5]:
#function to tokenize and remove stop words
def parsingText(file,fileType,stopwords_set):
    """Return the distinct, filtered tokens that occur in one text file.

    The whole file is read, lower-cased and tokenized. Duplicates are removed,
    then tokens found in ``stopwords_set`` and tokens of two characters or
    fewer are discarded.

    Args:
        file: Name of the text file inside ``fileType``.
        fileType: Directory that contains ``file``.
        stopwords_set: Collection of words to discard.

    Returns:
        A list with each surviving token exactly once, in no particular order.
    """
    #joining file path
    file_text = os.path.join(fileType, file)

    # The context manager closes the file as soon as it has been read.
    with open(file_text,'r') as f:
        message = f.read().lower()

    #tokenizing the string using the regex provided
    tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)?")

    # De-duplicate first so each word is reported at most once per file.
    unique_tokens = set(tokenizer.tokenize(message))

    return [w for w in unique_tokens if w not in stopwords_set and len(w)>2]

# Accumulates the per-file token lists of every text file.
allWords = []


# Visit every entry of the text directory, skipping the macOS metadata file.
for file in os.listdir(theText):
    if file == '.DS_Store':
        continue
    # parsingText yields each word at most once per file.
    fileWords = parsingText(file, theText, stopwords_set)
    allWords.extend(fileWords)

### 4.4  Final function to create the vocab

In [6]:
# Count how often each word occurs in the combined word list.
counts = Counter(allWords)

# Words counted more than 132 times are treated as stop words from here on.
stopwords_set.update(word for word, freq in counts.items() if freq > 132)

# Keep only the distinct words that are not stop words.
final_tokens = {w for w in allWords if w not in stopwords_set}

# Alphabetically ordered vocabulary.
finalVocab = sorted(final_tokens)

# Map every vocabulary word to its position in the sorted vocabulary.
vocabDict = {word: position for position, word in enumerate(finalVocab)}

In [7]:
# Write the vocabulary to the vocab file, one "word:index" entry per line.
# The context manager closes the file even if a write fails.
with open(vocab,'w') as f:
    for word, index in vocabDict.items():
        f.write(str(word)+':'+str(index)+'\n')

### 4.5  Function to create sparse representation of the raw text

In [8]:
def sparseText(file,fileType):
    """Count vocabulary-word occurrences for every line of a text file.

    Each line is lower-cased and tokenized; tokens present in the module-level
    ``vocabDict`` are replaced by their vocabulary index and counted. Lines
    that contain no vocabulary word produce no entry.

    Args:
        file: Name of the text file inside ``fileType``.
        fileType: Directory that contains ``file``.

    Returns:
        A list of ``Counter`` objects (vocabulary index -> occurrences), one
        per line that contains at least one vocabulary word.
    """
    file_text = os.path.join(fileType, file)

    # Build the tokenizer once instead of once per word.
    tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)?")

    rows=[]
    # The context manager guarantees the file is closed when reading is done.
    with open(file_text,'r') as f:
        for line in f:
            # The pattern never matches whitespace, so tokenizing the whole
            # line yields the same tokens as tokenizing each word separately.
            indices = [
                vocabDict[word]
                for word in tokenizer.tokenize(line.lower())
                if word in vocabDict
            ]
            if indices:
                rows.append(Counter(indices))
    return rows

def finalSparse(file, fileType):
    """Render the sparse representation of a text file as a string.

    Every entry returned by ``sparseText`` becomes one output line of
    comma-separated ``index:count`` pairs.

    Args:
        file: Name of the text file inside ``fileType``.
        fileType: Directory that contains ``file``.

    Returns:
        The output lines joined with newlines, without a trailing newline.
    """
    rendered_rows = []
    for row_counts in sparseText(file,fileType):
        # join() avoids the trailing-comma trimming the original needed and
        # is well defined for every row, including an empty one.
        pairs = [str(key)+':'+str(count) for key, count in row_counts.items()]
        rendered_rows.append(','.join(pairs))
    return '\n'.join(rendered_rows)
        


# Write one sparse-representation file per input text file.
# open(..., 'w') does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs('./sparse_files', exist_ok=True)

for file in os.listdir(theText):
    if file == ".DS_Store":
        continue
    finalFile = './sparse_files/'+file[:-4]+'.txt'
    # Compute the content before opening the output file so that a failure
    # in finalSparse cannot leave an empty, truncated file behind.
    sparse_content = finalSparse(file,theText)
    # The context manager closes the file even if the write fails.
    with open(finalFile,'w') as f:
        f.write(sparse_content)

In [9]:
# Collect the boundary-vector line of every text file, skipping the macOS
# metadata file.
allSegs = []
for file in os.listdir(theText):
    if file == ".DS_Store":
        continue
    fileSegs = topicSegs2(file,theText,stopwords_set)
    allSegs.append(fileSegs)

# One line per file, without a trailing newline.
finalSegs = "\n".join(allSegs)

# The context manager closes the file even if the write fails.
with open(segFile,'w') as f:
    f.write(finalSegs)

## 5. Summary

1. In this project task, the input files were from raw text files created in task 1.
2. Several different approaches were used to create the sparse representation of the raw text.
3. The summary of the task is as follows:
    - Created a list holding the boolean vector of topic-segment boundaries.
    - Created a dictionary to store the vocabulary.
    - Created a sparse representation of the raw text after tokenizing it and matching the corresponding words to the vocab.
4. The final outputs were exported to the relevant files.