In [1]:
!conda install --yes --quiet pymongo




# All requested packages already installed.
# packages in environment at /opt/conda:
#
pymongo                   3.4.0                    py36_0    defaults


In [90]:
import pymongo
import sys
import warnings
warnings.filterwarnings("ignore")

### setting up Wikipedia API

In [54]:
import re
import requests
import pandas as pd
import numpy as np

In [6]:
r = requests.get('http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max')

In [7]:
type(r)

requests.models.Response

In [8]:
r.status_code # 200 means it worked!

200

In [9]:
r.json().keys()  #converets to json and asks for keys, same as python keys

dict_keys(['batchcomplete', 'limits', 'query'])

In [10]:
r.json()['query'].keys()  ##checking value within JSON file

dict_keys(['categorymembers'])

In [146]:
cat_df = pd.DataFrame(r.json()['query']['categorymembers'])
cat_df.tail()

Unnamed: 0,ns,pageid,title
225,14,11737376,Category:Statistical natural language processing
226,14,40149461,Category:Structured prediction
227,14,52763867,Category:Supervised learning
228,14,31176997,Category:Support vector machines
229,14,52763828,Category:Unsupervised learning


In [147]:
cat_df.to_pickle('../pickle_jar/cat_df.p')

In [20]:
cat_df = pd.read_pickle('../pickle_jar/cat_df.p')

### creating function for searching topics

In [228]:
def search(phrase):
    phrase = re.sub('\s', '+', phrase)
    query = 'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+{}&cmlimit=max'.format(phrase)
    QR = requests.get(query)
    return(pd.DataFrame(QR.json()['query']['categorymembers']))





#### Preliminary search for Category:Machine learning

In [None]:
search('Machine learning')

### Preliminary search of 'Machine Learning'

In [148]:
search('Bayesian networks')

Unnamed: 0,ns,pageid,title
0,0,203996,Bayesian network
1,0,42734031,Bayesian hierarchical modeling
2,0,1169985,Causal Markov condition
3,0,1242713,Dynamic Bayesian network
4,0,1194259,Influence diagram
5,0,4855682,Junction tree algorithm
6,0,2649330,Latent variable
7,0,1169984,Markov blanket
8,0,4855451,Moral graph
9,0,15882673,Plate notation


#### Preliminary search for Category:Business software

In [None]:
search('Business software')


### Creating recursive function

In [230]:
def search_recursive(phrase, max_depth=2):

    
        
    if max_depth > 0:  ### max_depth defaults to 2, user input can take it higher
        
        cat2_df = search(phrase)  ###preliminary search to get initial pages and `Step 2` Subcategories
#         print('depth: ', max_depth, ' category: ', phrase)
        
        cat_mask = cat2_df['title'].str.contains('Category:')  ### creating mask for all Subcategories
        
        pages_df = cat2_df[~cat_mask]  ### creating a DF where only pages exist; no Subcategories
        
        newlist = []  ### new local list vaariable
        newlist.append(pages_df)  ### appending empty list with all pages, no Subcategories
        
        categories = cat2_df[cat_mask]['title'].str.replace('Category:', '').tolist() 
        ### any and all Subcategories get turned to a temporary list
        
        if len(categories) > 0:  ### if there are any sub-categories, this if statement starts
            for i in categories:  ### iterates through the subcategories list
                
                max_depth -= 1   ### looks one level deeper into each subcategory
                newlist.append(search_recursive(i, max_depth)) ### extracts all pages & subcategories from this deeper page
                max_depth += 1  ### returns back to the current page
        
        pages_df = pd.concat(newlist)  #adding newlist to pages_df
        pages_df.reset_index() ### attempting to create an additive index, doesnt appear to work though
       
        return pages_df

    else:  ### if Category or Subcategory has no further subcategories, no further steps will be taken
        return

#### Using recursive search function to find all articles in "Category:Machine learning"

In [231]:
machine_learning_md3_df = search_recursive("Machine learning", 3)
machine_learning_md3_df.shape

(1532, 3)

#### 1532 pages!!!  If `"Category:Machine learning"` is step 1, we go to step 3 to determine this quantity. 

In [232]:
machine_learning_md3_df.head()

###indices seem to be not correct.  Does it really matter?  We are interested in pageids.

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,3771060,Accuracy paradox


#### Using recursive search function to find all articles in "Category:Business software"

In [246]:
business_software_md3_df = search_recursive("Business software", 3)
business_software_md3_df.shape

(4121, 3)

#### 4121 pages!!!  If `"Category:Business software"` is step 1, we go to step 3 to determine this quantity. 

In [248]:

business_software_md3_df.head()

Unnamed: 0,ns,pageid,title
0,0,1037763,Business software
1,0,41270069,AccuSystems
2,0,5211212,Active policy management
3,0,28502793,Alexandria (library software)
4,0,44133735,Alteryx


In [247]:
business_software_md3_df.tail()

Unnamed: 0,ns,pageid,title
39,0,1577008,Ted (word processor)
40,0,37628014,Thorn EMI Liberator
41,0,34306381,Trelby
42,0,29902828,Word Juggler
43,0,826279,WordMARC


### Creating function to extract all literature from pages

In [63]:

from string import punctuation

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def get_page_contents(pageid):
    
    query = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&\
             rvprop=content&rvsection=0&format=json&pageids={}'.format(pageid)
    
    my_request = requests.get(query).json()
    
    no_html_string = striphtml(my_request['query']['pages'][str(pageid)]['extract']).replace('\n', ' ')
    
    return strip_punctuation(no_html_string)

In [165]:
requests.get('http://en.wikipedia.org/w/api.php?action=query&prop=extracts&rvprop=content&rvsection=0&format=json&pageids=233497').json()

### returns a non-clean version

{'batchcomplete': '',
 'query': {'pages': {'233497': {'extract': '<p><b>Unsupervised machine learning</b> is the machine learning task of inferring a function to describe hidden structure from "unlabeled" data (a classification or categorization is not included in the observations). Since the examples given to the learner are unlabeled, there is no evaluation of the accuracy of the structure that is output by the relevant algorithm—which is one way of distinguishing unsupervised learning from supervised learning and reinforcement learning.</p>\n<p>A central case of unsupervised learning is the problem of density estimation in statistics, though unsupervised learning encompasses many other problems (and solutions) involving summarizing and explaining key features of the data.</p>\n<p>Approaches to unsupervised learning include:</p>\n<ul><li>clustering\n<ul><li>k-means</li>\n<li>mixture models</li>\n<li>hierarchical clustering,</li>\n</ul></li>\n<li>anomaly detection</li>\n<li>Neural Net

In [68]:
get_page_contents(826279)

### returns the cleaned version as per our function

'WordMARC was a scientifically oriented word processor developed by MARC Software an offshoot of MARC Analysis Research Corporation which specialized in high end Finite Element Analysis software for mechanical engineering It ran originally on minicomputers such as Prime and Digital Equipment Corporation VAX When the IBM PC emerged as the platform of choice for word processing WordMARC allowed users to easily move documents from a minicomputer where they could be easily shared to PCs WordMARC was the creation of Pedro Marcal who pioneered work in finite element analysis and needed a technical word processor that both supported complex notations and was capable of running on minicomputers and other highend machines such an Alliant and ATampT WordMARC was originally known as MUSE MARC Universal Screen Editor but the name was changed because of a trademark conflict with another company when the product was ported to the IBM PC In comparison with WordPerfect WordMARCs codes were always hidd

In [234]:
# Cleaning up the index for the dataframe.  I think it was messsin up the 

index_list = []
for i in range(1532):
    index_list.append(i)

# machine_learning_md3_df.reset_index(index_list)
# machine_learning_md3_df.drop(index, axis=1, inplace=True)
machine_learning_md3_df['Category'] = 'Machine Learning'
machine_learning_md3_df.index = index_list


In [252]:
#cleaning up the index for the Business Software Dataframe

bs_index_list = []
for i in range(4121):
    bs_index_list.append(i)
    
business_software_md3_df.index = bs_index_list
business_software_md3_df['Category'] = 'Business Software'
business_software_md3_df.tail()

Unnamed: 0,ns,pageid,title,Category
4116,0,1577008,Ted (word processor),Business Software
4117,0,37628014,Thorn EMI Liberator,Business Software
4118,0,34306381,Trelby,Business Software
4119,0,29902828,Word Juggler,Business Software
4120,0,826279,WordMARC,Business Software


#### Filling our machine_learning dataframe with text for each article.

In [236]:
machine_learning_md3_df.tail()

Unnamed: 0,ns,pageid,title,Category
1527,0,11273721,Hierarchical temporal memory,Machine Learning
1528,0,33742232,Restricted Boltzmann machine,Machine Learning
1529,0,76996,Self-organizing map,Machine Learning
1530,0,48813654,Sparse dictionary learning,Machine Learning
1531,0,47805,Vector quantization,Machine Learning


#### For loop to populate our Dataframe with articles

In [239]:
### Testing how to input unique contents into each row.
# machine_learning_md3_df['Text'][0] = get_page_contents(43385931)
# machine_learning_md3_df['Text'][1] = get_page_contents(49082762)
# machine_learning_md3_df['Text'][2] = get_page_contents(233488)


### only uncomment the next line if you want to reset all text 
# machine_learning_md3_df['Text'] = None

for i,j in enumerate(machine_learning_md3_df['pageid']):
#     print(i,j)
    machine_learning_md3_df['Text'][i] = get_page_contents(j)
    
    
    ### use this for loop and get_page_contents to extract all page contents. 

In [242]:
# machine_learning_md3_df['Text'][177] = get_page_contents(1579244)
machine_learning_md3_df.head()

Unnamed: 0,ns,pageid,title,Category,Text
0,0,43385931,Data exploration,Machine Learning,Data exploration is an approach similar to ini...
1,0,49082762,List of datasets for machine learning research,Machine Learning,These datasets are used for machinelearning re...
2,0,233488,Machine learning,Machine Learning,Machine learning is a field of computer scienc...
3,0,53587467,Outline of machine learning,Machine Learning,The following outline is provided as an overvi...
4,0,3771060,Accuracy paradox,Machine Learning,The accuracy paradox for predictive analytics ...


In [243]:
machine_learning_md3_df.tail()

Unnamed: 0,ns,pageid,title,Category,Text
1527,0,11273721,Hierarchical temporal memory,Machine Learning,Hierarchical temporal memory HTM is a biologic...
1528,0,33742232,Restricted Boltzmann machine,Machine Learning,A restricted Boltzmann machine RBM is a genera...
1529,0,76996,Self-organizing map,Machine Learning,A selforganizing map SOM or selforganizing fea...
1530,0,48813654,Sparse dictionary learning,Machine Learning,Sparse dictionary learning is a representation...
1531,0,47805,Vector quantization,Machine Learning,Vector quantization VQ is a classical quantiza...


#### Pickling the final machine learning Dataframe, with articles included

In [241]:
machine_learning_md3_df.to_pickle('../pickle_jar/machine_learning_md3_df.p')

In [55]:
machine_learning_md3_df = pd.read_pickle('../pickle_jar/machine_learning_md3_df.p')

In [302]:
ml_df1 = machine_learning_md3_df.copy()
ml_df2 = machine_learning_md3_df.copy()

###Since dict is too large to send to MongoDB, we will break into 2 separate DF's

# ml_df1.drop([i for i in range(1000,1531)], axis=0, inplace=True)
# ml_df2.drop([i for i in range(0,1000)], axis=0, inplace=True)

ml_df1.shape, ml_df2.shape


((1001, 5), (532, 5))

In [303]:
ml_dict1 = ml_df1.to_dict('list')
ml_dict2 = ml_df2.to_dict('list')

#### filling our Business Software dataframe with articles

In [256]:
### only uncomment the next line if you want to reset all text 
# business_software_md3_df['Text'] = None

for i,j in enumerate(business_software_md3_df['pageid']):
#     print(i,j)
    business_software_md3_df['Text'][i] = get_page_contents(j)
    
    
    ### Returned 3684 observations
    ### use this for loop and get_page_contents to extract all page contents. 

In [308]:
business_software_md3_df.head()

(4121, 5)

In [318]:
business_software_md3_df.tail()

Unnamed: 0,ns,pageid,title,Category,Text
4116,0,1577008,Ted (word processor),Business Software,Ted is a word processor for the X Window Syste...
4117,0,37628014,Thorn EMI Liberator,Business Software,The Thorn EMI Liberator was a laptop word proc...
4118,0,34306381,Trelby,Business Software,Trelby is a free and open source screenwriting...
4119,0,29902828,Word Juggler,Business Software,Word Juggler was a word processor application ...
4120,0,826279,WordMARC,Business Software,WordMARC was a scientifically oriented word pr...


In [259]:
business_software_md3_df.to_pickle('../pickle_jar/business_software_md3_df.p')

###this DF contains ~4120 articles.

In [56]:
business_software_md3_df = pd.read_pickle('../pickle_jar/business_software_md3_df.p')

In [314]:
bs_dict = business_software_md3_df.to_dict('list')

### Setting up our Mongo database

In [35]:
client = pymongo.MongoClient('34.214.191.95', 27016)

In [46]:
client.database_names(), db_ref.collection_names()

(['admin', 'local', 'my_database', 'test', 'twitter'], ['my_collection'])

#### Creating and confirming that `Machine` exists

In [36]:
client['Machine']

###we see this is now a database

Database(MongoClient(host=['34.214.191.95:27016'], document_class=dict, tz_aware=False, connect=True), 'Machine')

#### Setting up reference to `Machine`

In [47]:
db_ref = client.Machine

In [48]:
db_ref

### confirming that machine is the 

Database(MongoClient(host=['34.214.191.95:27016'], document_class=dict, tz_aware=False, connect=True), 'Machine')

#### setting up `coll_ref` to `my_collection`

In [49]:
coll_ref = db_ref.my_collection

#### creating and inserting a sample_doc

In [41]:
sample_doc = {"name":"Kevin", "message":"Test", 'my_array' :[1,2,3,4,5,6,7,9]}

In [315]:
### Inserting 2 separate dictionaries for ML & 1 dict for BS

# coll_ref.insert_one(ml_dict1)
# coll_ref.insert_one(ml_dict2)
# coll_ref.insert_one(bs_dict)

<pymongo.results.InsertOneResult at 0x7f4502721ee8>

#### confirming that our new database, `machine`, exists

In [306]:
client.database_names(), db_ref.collection_names()

### we see it here!!!

(['Machine', 'admin', 'local', 'my_database', 'test', 'twitter'],
 ['my_collection'])

### create a cursor

In [20]:
cursor = coll_ref.find()

In [21]:
cursor

<pymongo.cursor.Cursor at 0x7f4534948908>

In [None]:
cursor = coll_ref.find()

cursor

sample_docs = list(cursor)

In [317]:
coll_ref.count()

### 3 includes 1 sample doc and 2 ML dictionaries, and 1 BS dictionary

4

In [None]:
1) # Create new database

2)# Create variable that references wikipedia db
# business_db = db_ref.business
# machine_db = db_ref.machine

# OR 

# wiki_db = db_ref.wiki


