## Wikipedia Crawl example

Author: J. Hickman

- This code crawls through wikipedia to get a bunch of text data
- The code lets the user specify search category topics.
  - The more different the topics are, the easier the classification will be.
  - For example, i used (pizza, metallurgy, basketball)
- It then searches wikipedia for articles related to these topics
- Loops over the wikipedia pages and gets the text from the wikipedia pages
- Breaks the text into chunks (based on a user input specifying the number of sentences per chunk)
- Each chunk is cleaned and tagged with a "label" (classification) and a numeric "sentiment score" (regression)
- These cleaned chunks form a corpus of strings with associated tags

```
python -m pip install wikipedia_sections
```

### Import

In [1]:
# conda install -c conda-forge wikipedia
# conda install -c conda-forge wordcloud
# python -m pip install wikipedia_sections

import wikipedia
import nltk
import string 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


In [2]:
# RUN THE FOLLOWING IF YOU HAVEN'T DOWNLOADED THESE BEFORE
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Set user parameters 

In [3]:
# PARAMETERS 
label_list=['electric vehicle','gasoline vehicle','hybrid vehicle']
max_num_pages=25
sentence_per_chunk=5
min_sentence_length=20

# GET STOPWORDS
# from nltk.corpus import stopwords
stop_words=nltk.corpus.stopwords.words('english')

# INITALIZE STEMMER+LEMITZIZER+SIA
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

### Define text cleaning function

In [4]:
def clean_string(text):
	# #FILTER OUT UNWANTED CHAR
	new_text=""
	# keep=string.printable
	keep=" abcdefghijklmnopqrstuvwxyz0123456789"
	for character in text:
		if character.lower() in keep:
			new_text+=character.lower()
		else: 
			new_text+=" "
	text=new_text
	# print(text)

	# #FILTER OUT UNWANTED WORDS
	new_text=""
	for word in nltk.tokenize.word_tokenize(text):
		if word not in nltk.corpus.stopwords.words('english'):
			#lemmatize 
			tmp=lemmatizer.lemmatize(word)
			# tmp=stemmer.stem(tmp)

			# update word if there is a change
			# if(tmp!=word): print(tmp,word)
			
			word=tmp
			if len(word)>1:
				if word in [".",",","!","?",":",";"]:
					#remove the last space
					new_text=new_text[0:-1]+word+" "
				else: #add a space
					new_text+=word.lower()+" "
	text=new_text.strip()
	return text

# clean_string('the word "pizza" first appeared in a Latin text from the town of Gaeta, then still part of the Byzantine Empire, in 997 AD; the text states that a tenant of certain property is to give the bishop of Gaeta duodecim pizze ("twelve pizzas") every Christmas Day, and another twelve every Easter Sunday.Suggested etymologies include:')


### Preform a wikipedia crawl


In [5]:
#INITIALIZE 
corpus=[]  # list of strings (input variables X)
targets=[] # list of targets (labels or response variables Y)

#--------------------------
# LOOP OVER TOPICS 
#--------------------------
for label in label_list:

	#SEARCH FOR RELEVANT PAGES 
	titles=wikipedia.search(label,results=max_num_pages)
	print("Pages for label =",label,":",titles)

	#LOOP OVER WIKI-PAGES
	for title in titles:
		try:
			print("	",title)
			wiki_page = wikipedia.page(title, auto_suggest=True)

			# LOOP OVER SECTIONS IN ARTICLE AND GET PAGE TEXT
			for section in wiki_page.sections:
				text=wiki_page.section(section); #print(text)

				#BREAK IN TO SENTANCES 
				sentences=nltk.tokenize.sent_tokenize(text)
				counter=0
				text_chunk=''

				#LOOP OVER SENTENCES 
				for sentence in sentences:
					if len(sentence)>min_sentence_length:
						if(counter%sentence_per_chunk==0 and counter!=0):
							# PROCESS COMPLETED CHUNK 
							
							# CLEAN STRING
							text_chunk=clean_string(text_chunk)

							# REMOVE LABEL IF IN STRING (MAKES IT TOO EASY)
							text_chunk=text_chunk.replace(label,"")
							
							# REMOVE ANY DOUBLE SPACES
							text_chunk=' '.join(text_chunk.split()).strip()

							#UPDATE CORPUS 
							corpus.append(text_chunk)

							#UPDATE TARGETS
							score=sia.polarity_scores(text_chunk)
							target=[label,score['compound']]
							targets.append(target)

							#print("TEXT\n",text_chunk,target)

							# RESET CHUNK FOR NEXT ITERATION 
							text_chunk=sentence
						else:
							text_chunk+=sentence
						#print("--------\n", sentence)
						counter+=1

		except:
			print("WARNING: SOMETHING WENT WRONG:", title);  


	 Electric vehicle
	 History of the electric vehicle
	 Battery electric vehicle
	 Electric vehicle battery
	 Hybrid electric vehicle
	 Electric car use by country
	 Plug-in electric vehicle
	 List of production battery electric vehicles
	 Neighborhood Electric Vehicle
	 Hybrid vehicle drivetrain
	 Aptera (solar electric vehicle)
	 Citroën Ami (electric vehicle)
	 Electric car
	 Electric vehicle conversion
	 Hybrid vehicle
	 Plug-in hybrid
	 Capacitor electric vehicle
	 Charging station
	 Grumman LLV
	 London Electric Vehicle Company
	 Electric vehicle industry in China
	 Fuel cell vehicle
	 Plug-in electric vehicles in China
	 Electric Vehicle Company
Pages for label = gasoline vehicle : ['Petrol engine', 'Flexible-fuel vehicle', 'Gasoline', 'Alternative fuel vehicle', 'Electric vehicle', 'Miles per gallon gasoline equivalent', 'Hybrid electric vehicle', 'Natural gas vehicle', 'Bi-fuel vehicle', 'Gasoline pump', 'Common ethanol fuel mixtures', 'Hydrogen internal combustion engine vehic

### Save results

In [6]:
#SANITY CHECKS AND PRINT TO FILE 
print("number of text chunks = ",len(corpus))
print("number of targets = ",len(targets))

tmp=[]
for i in range(0,len(corpus)):
    tmp.append([corpus[i],targets[i][0],targets[i][1]])
df=pd.DataFrame(tmp)
df=df.rename(columns={0: "text", 1: "label", 2: "sentiment"})
print(df)
df.to_csv('ev-wiki-crawl-results.csv',index=False)

number of text chunks =  1173
number of targets =  1173
                                                   text             label  \
0     electric motive power started 1827 hungarian p...  electric vehicle   
1     first mass produced appeared america early 190...  electric vehicle   
2     20th century uk world largest user electric ro...  electric vehicle   
3     1900 28 percent car road electric ev popular e...  electric vehicle   
4     seldom marketed woman luxury car may stigma am...  electric vehicle   
...                                                 ...               ...   
1168  best known best selling steam powered car stan...    hybrid vehicle   
1169  wind powered vehicle well known long time real...    hybrid vehicle   
1170  wood gas used power car ordinary internal comb...    hybrid vehicle   
1171  hybrid air vehicle formed 2007 roger munk jeff...    hybrid vehicle   
1172  hav 304 developed military lemv project follow...    hybrid vehicle   

      sentiment  
0