**README**
- This notebook includes V1 of the discovery pipeline, focused on Twitter.
- It covers evaluating Twitter search results for relevance, scraping text from the URLs of the results judged relevant, and then evaluating the scraped text for relevance before returning a final results list for a user to check.
- Predictions are run against relevant training sets (twitter, descriptions).
- Once this flow has been tested the aim is to rebuild it with all key functions as external python files so that the user would only have to manipulate the necessary inputs at every step to get a result. 

TO DO:
- Export twitter training to pkl to load faster
- Add sites to blacklist before scraping
- Add values to remove after scraping, e.g. 404
- Integrate twitter search

Step 1:
- Load libraries and functions 

In [1]:
#imports + path
from __future__ import print_function
import requests
import pandas as pd
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from bs4 import BeautifulSoup
# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# Project root on the author's machine. It must end with '/' because the loading
# cells build file names by plain string concatenation (path + 'SUBDIR/file.pkl').
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/'

In [2]:
#PREDICTION FUNCTION W/ and W/O LOGREGCV 

def lr_model_predict_cv(t_input, t_feature, target, cv_int, score_type, p_input, p_feature, filename, path):
    """Train a TF-IDF + LogisticRegressionCV classifier and score a prediction set.

    Parameters
    ----------
    t_input : DataFrame holding the training data.
    t_feature : name of the text column in ``t_input`` used as the feature.
    target : name of the label column in ``t_input``.
    cv_int : passed to ``LogisticRegressionCV(cv=...)``.
    score_type : passed to ``LogisticRegressionCV(scoring=...)``.
    p_input : DataFrame holding the texts to classify (left untouched; a copy is returned).
    p_feature : name of the text column in ``p_input``.
    filename : base name of the saved model; written to
        ``<path>/LOGREG_RELEVANCE/<filename>.sav``.
    path : directory that contains the ``LOGREG_RELEVANCE`` folder.

    Returns
    -------
    A copy of ``p_input`` with four extra columns:
    ``Prediction`` (predicted label), ``Score`` (``decision_function`` value),
    ``Probability`` (per-class *log*-probabilities from ``predict_log_proba``)
    and ``Input Length`` (character length of ``p_feature``).
    """
    import os  # local import so the function does not depend on the notebook's import cell

    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_count = count_vect.fit_transform(t_input[t_feature])
    x_train = tfidf_transformer.fit_transform(x_count)
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    model.fit(x_train, y_train)

    # NOTE(review): only the classifier is saved; the fitted vectorizer and TF-IDF
    # transformer are not, so the .sav file alone cannot score raw text later.
    export = os.path.join(path, f'LOGREG_RELEVANCE/{filename}.sav')
    # Context manager guarantees the file is flushed and closed even if dump fails.
    with open(export, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Reuse the vocabulary and IDF weights learned on the training set.
    x_new_count = count_vect.transform(p_input[p_feature])
    x_new_train = tfidf_transformer.transform(x_new_count)

    result = p_input.copy()
    result['Prediction'] = list(model.predict(x_new_train))
    result['Score'] = list(model.decision_function(x_new_train))
    # One array of per-class log-probabilities per row.
    result['Probability'] = list(model.predict_log_proba(x_new_train))
    result['Input Length'] = result[p_feature].str.len()
    return result

def lr_model_predict(t_input, t_feature, target, p_input, p_feature, filename, path):
    """Train a TF-IDF + LogisticRegression (C=10) classifier and score a prediction set.

    Parameters
    ----------
    t_input : DataFrame holding the training data.
    t_feature : name of the text column in ``t_input`` used as the feature.
    target : name of the label column in ``t_input``.
    p_input : DataFrame holding the texts to classify (left untouched; a copy is returned).
    p_feature : name of the text column in ``p_input``.
    filename : base name of the saved model; written to
        ``<path>/LOGREG_RELEVANCE/<filename>.sav``.
    path : directory that contains the ``LOGREG_RELEVANCE`` folder.

    Returns
    -------
    A copy of ``p_input`` with four extra columns:
    ``Prediction`` (predicted label), ``Score`` (``decision_function`` value),
    ``Probability`` (per-class *log*-probabilities from ``predict_log_proba``)
    and ``Input Length`` (character length of ``p_feature``).
    """
    import os  # local import so the function does not depend on the notebook's import cell

    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_count = count_vect.fit_transform(t_input[t_feature])
    x_train = tfidf_transformer.fit_transform(x_count)
    y_train = t_input[target].values
    model = LogisticRegression(solver='liblinear', C=10.0, random_state=44)
    model.fit(x_train, y_train)

    # NOTE(review): only the classifier is saved; the fitted vectorizer and TF-IDF
    # transformer are not, so the .sav file alone cannot score raw text later.
    export = os.path.join(path, f'LOGREG_RELEVANCE/{filename}.sav')
    # Context manager guarantees the file is flushed and closed even if dump fails.
    with open(export, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Reuse the vocabulary and IDF weights learned on the training set.
    x_new_count = count_vect.transform(p_input[p_feature])
    x_new_train = tfidf_transformer.transform(x_new_count)

    result = p_input.copy()
    result['Prediction'] = list(model.predict(x_new_train))
    result['Score'] = list(model.decision_function(x_new_train))
    # One array of per-class log-probabilities per row.
    result['Probability'] = list(model.predict_log_proba(x_new_train))
    result['Input Length'] = result[p_feature].str.len()
    return result

In [3]:
#scrape URLs for title, desc and URL text 

def scrape_links(link_list, timeout=10):
    """Fetch each URL and collect its <title> text and paragraph text.

    Parameters
    ----------
    link_list : iterable of URL strings.
    timeout : seconds to wait for each request before giving up on that URL.

    Returns
    -------
    DataFrame with columns ``Title``, ``Description`` and ``URL``, one row per
    page that could be fetched and that has both a <head> and a <body>.
    ``Description`` is the text of every <p> inside <body>, joined with spaces.
    Pages that cannot be fetched or parsed are skipped.
    """
    records = []
    for link in link_list:
        URL = link
        try:
            page = requests.get(URL, timeout=timeout)
        except Exception:
            # Best-effort scraping: any fetch failure (connection error, timeout,
            # malformed URL, ...) skips this link. Continuing here is essential,
            # otherwise the previous iteration's `page` would be parsed and stored
            # under this URL (or `page` would be undefined on the first link).
            continue
        try:
            soup = BeautifulSoup(page.content, "html.parser")
            head = soup.find('head')
            body = soup.find('body')
            if head is None or body is None:
                continue
            title = ' '.join(t.text for t in head.find_all('title')).strip()
            text = ' '.join(p.text for p in body.find_all('p')).strip()
        except AssertionError:
            # Kept from the original: the parser can raise AssertionError on
            # some malformed markup; such pages are skipped.
            continue
        records.append({'Title': title, 'Description': text, 'URL': URL.strip()})
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=['Title', 'Description', 'URL'])

Step 2:
- load training and prediction sets 
- Twitter training sets are taken from bigram searches run over the whole of 2021. Positive sets are the bigram search results, filtered by language to remove non-English results for now; the negative set is sampled so that the two halves of the training set are even.
- Prediction sets are taken from bigram searches covering the first 3 months of 2022.

In [4]:
#description training set 
# Two pickled training frames for the description (scraped-text) classifier.
# `new_training_set` is the one passed to the Step 5 prediction calls, which read
# its 'Description' and 'Target' columns; `training_set_even_adds` is loaded here
# but not used by any cell shown in this notebook.
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
training_set_even_adds = pd.read_pickle(path+'LOGREG_RELEVANCE/trainingset_even_extended.pkl')
new_training_set = pd.read_pickle(path+'LOGREG_RELEVANCE/new_training_set.pkl')

In [5]:
#negative twitter training set
# Tweets from two off-topic searches, English only, labelled '0'.
dh = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/digital_humanities_2021.pkl')
music_company = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/music_company_2021.pkl')
# Built as one chain so the label column is added with assign() on a new frame
# rather than written onto a filtered slice (which triggers SettingWithCopyWarning).
# n=4379 is presumably the size of the English positive set, so that both classes
# are balanced - TODO confirm and derive it instead of hard-coding.
twitter_neg = (
    pd.concat([dh, music_company])
    .loc[lambda tweets: tweets['lang'] == 'en']
    .assign(Target='0')
    .sample(n=4379, random_state=56)
    .loc[:, ['tweet', 'Target']]
    .reset_index(drop=True)
)

In [6]:
#positive twitter training set 
# One pickled search result per bigram; all are labelled '1' below.
music_collection = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_collection.pkl')
song_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_song_dataset.pkl')
sound_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_sound_archive.pkl')
digital_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_archive.pkl')
music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_archive.pkl')
digi_music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_music_archive.pkl')
midi_file = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_midi_file.pkl')
music_data = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_data.pkl')
music_research = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_music_research.pkl')
music_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_dataset.pkl')
# Built as one chain so the label column is added with assign() on a new frame
# rather than written onto a filtered slice (which triggers SettingWithCopyWarning).
twitter_pos = (
    pd.concat([sound_archive, music_collection, digital_archive, music_archive, song_dataset, digi_music_archive, midi_file, music_data, music_research, music_dataset])
    .loc[lambda tweets: tweets['lang'] == 'en']
    .assign(Target='1')
    .loc[:, ['tweet', 'Target']]
    .reset_index(drop=True)
)

In [7]:
#final twitter training set
# Stack positives on top of negatives, turn the string labels into integers
# and renumber the rows from 0.
twitter_set = (
    pd.concat([twitter_pos, twitter_neg])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

In [23]:
#load a prediction set 
# Read the saved search results and keep only tweets tagged as English.
prediction_twitter = (
    pd.read_pickle(path+'TWITTER_SEARCHES/PREDICTIONS/digital_archive_22.pkl')
    .loc[lambda tweets: tweets['lang'] == 'en']
)

In [17]:
# Sanity check: number of English tweets in the prediction set.
len(prediction_twitter)

467

Step 3:
- run the predict function for twitter (logregcv and logreg options)
- filter the results by positives and optionally by inclusion of the 'music' kw
- return a df w/ tweet, prediction value, confidence score, probability, length of input and url 

In [9]:
#variable for removing unwanted results 
# Substrings that mark a result as unwanted. The filter cells join them with '|'
# and pass the result to Series.str.contains, so they act as one regular
# expression; a term containing regex metacharacters would need escaping.
discard = ['youtu', '404', 'Not Found', 'bandcamp']

In [24]:
#run w/ LogRegCV
# Uses the `path` set in the imports cell instead of repeating the absolute path,
# so the project location only has to be changed in one place.
tweet_predict_cv = lr_model_predict_cv(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    cv_int=2,
    score_type='precision',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test_cv',
    path=path,
)

In [26]:
#filter and display results 
# Keep positive predictions, drop rows whose url matches a discard term,
# rank by confidence score and keep only the columns needed for review.
tweet_predict_cv_df = (
    tweet_predict_cv[tweet_predict_cv['Prediction'] == 1]
    .loc[lambda hits: ~hits['url'].str.contains('|'.join(discard))]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_cv_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,Digital Archive of the Guatemalan National Pol...,1,23.931470,"[-23.931468435606952, -4.0429215531752186e-11]",105,https://ahpn.lib.utexas.edu/
1,The Bake and Jairazbhoy Digital Archive of #So...,1,23.758199,"[-23.758196714332918, -4.8077986037943225e-11]",303,https://bit.ly/ucla-bake-jairazbhoy
2,The Digital Archive by ⁦⁦@neueprojects⁩ and ⁦@...,1,23.175183,"[-23.175184050042912, -8.612821567426466e-11]",264,https://jingculturecommerce.com/the-digital-ar...
3,I have been running a digital archive of Syria...,1,22.983262,"[-22.98326329256115, -1.043507502683827e-10]",288,http://www.instagram.com/syriabefore2011
4,"""Determining the value of a digital archive. T...",1,22.406905,"[-22.40690473104964, -1.8569701333907e-10]",120,https://researchgate.net/publication/358964775...
...,...,...,...,...,...,...
887,""" ‘You’ve had these since 1948, and you’ve kep...",1,0.248499,"[-0.8250956879030305, -0.5765970124546717]",277,https://www.thej.ca/2022/01/20/a-lost-jewish-h...
888,@RBrookhiser I'd suggest looking at the notes ...,1,0.230686,"[-0.8151275544725489, -0.5844414330397312]",113,https://www.marshallfoundation.org/library/dig...
889,"Written by @KateHolterhoff, this #blogpost dis...",1,0.207950,"[-0.8025176439267384, -0.5945680525536541]",301,https://jvc.oup.com/2013/10/21/ethics-and-the-...
890,@elliecohanim @LeeSmithDC @AmandaMilius @PATPm...,1,0.080514,"[-0.7342145317045646, -0.653700037788195]",338,https://www.marshallfoundation.org/library/dig...


In [28]:
#optional filter by kw
substring = 'music'
# One boolean per ROW: True when any string cell of that row contains the keyword
# (case-insensitive). The reduction with any(axis=1) is required: a rows-by-columns
# mask is not a valid row selector for .loc and, in pandas versions that accept it,
# can return a row once per matching cell.
mask = (
    tweet_predict_cv_df
    .apply(lambda column: column.map(lambda x: substring in x.lower() if isinstance(x, str) else False))
    .any(axis=1)
    .to_numpy()
)
tweet_predict_cv_df_kw = tweet_predict_cv_df.loc[mask].reset_index(drop=True)
tweet_predict_cv_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,The Bake and Jairazbhoy Digital Archive of #So...,1,23.758199,"[-23.758196714332918, -4.8077986037943225e-11]",303,https://bit.ly/ucla-bake-jairazbhoy
1,"From the archive: Charles Simic's ""Little Nigh...",1,19.516793,"[-19.51679274732415, -3.341678050971059e-09]",176,https://pubs.lib.uiowa.edu/iowareview/issue/94...
2,Found in traditional village music for centuri...,1,15.593415,"[-15.593415078929139, -1.689919097563871e-07]",303,http://folk-ukraine.com
3,"Soundbank, is now an established digital archi...",1,14.359977,"[-14.359977924772018, -5.801508768720133e-07]",203,https://www.converge.today/article/living-arch...
4,"Soundbank, is now an established digital archi...",1,14.359977,"[-14.359977924772018, -5.801508768720133e-07]",203,https://www.converge.today/article/living-arch...
5,"Go Hogs! 🐗 Play it over, and over, and over. ...",1,13.983457,"[-13.983457643561023, -8.45398924611416e-07]",287,http://ow.ly/wcEa50F7MCy
6,"Tomorrow at noon, join us for “The Digital Arc...",1,13.130243,"[-13.130244868518298, -1.9843009415252016e-06]",303,https://l8r.it/6eX2
7,The Lomax Digital Archive is an online museum ...,1,10.481329,"[-10.481356680674216, -2.8055029052899238e-05]",128,https://archive.culturalequity.org/
8,The July 2021 issue of The Wire is available t...,1,9.176707,"[-9.176810890103697, -0.00010341514020330393]",285,http://exacted.me/TitleShowcase
9,Check out 4 tracks from POLYBIUS by Sinnesloch...,1,9.05368,"[-9.053797173818412, -0.00011695296671168306]",148,https://burningwitchesrecords.com/music


In [33]:
#run w/ LogReg
# Uses the `path` set in the imports cell instead of repeating the absolute path,
# so the project location only has to be changed in one place.
tweet_predict = lr_model_predict(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test',
    path=path,
)

In [34]:
#filter and display results 
# Keep positive predictions, drop rows whose url matches a discard term,
# rank by confidence score and keep only the columns needed for review.
tweet_predict_df = (
    tweet_predict[tweet_predict['Prediction'] == 1]
    .loc[lambda hits: ~hits['url'].str.contains('|'.join(discard))]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,Digital Archive of the Guatemalan National Pol...,1,11.512635,"[-11.512644857975797, -1.000285649208476e-05]",105,https://ahpn.lib.utexas.edu/
1,The Digital Archive by ⁦⁦@neueprojects⁩ and ⁦@...,1,11.159183,"[-11.159197306997532, -1.4243780749964956e-05]",264,https://jingculturecommerce.com/the-digital-ar...
2,The Bake and Jairazbhoy Digital Archive of #So...,1,10.282121,"[-10.282155647001614, -3.4239228898364566e-05]",303,https://bit.ly/ucla-bake-jairazbhoy
3,"""Determining the value of a digital archive. T...",1,10.209076,"[-10.20911240745847, -3.6833824511754076e-05]",120,https://researchgate.net/publication/358964775...
4,@Rickybutler20 Our digital archive doesn't go ...,1,9.713478,"[-9.71353842258615, -6.046123197516628e-05]",194,https://www.britishnewspaperarchive.co.uk/
...,...,...,...,...,...,...
875,One of the most important projects in East Vie...,1,0.101115,"[-0.7449822900068299, -0.6438670555943788]",276,https://www.eastview.com/resources/gpa/nowy-dz...
876,@ColumbiaMayor @washcoll GCM also enjoyed Will...,1,0.077706,"[-0.7327548120363487, -0.6550487283039546]",231,https://www.marshallfoundation.org/library/dig...
877,Good brother. He translated to work. https://t...,1,0.057573,"[-0.7223479429552707, -0.6647749656714687]",84,https://siiasi.org/digital-archive/shaykh-muha...
878,""" ‘You’ve had these since 1948, and you’ve kep...",1,0.040378,"[-0.713540004850315, -0.673161925898287]",277,https://www.thej.ca/2022/01/20/a-lost-jewish-h...


In [37]:
#optional filter by kw
substring = 'music'
# One boolean per ROW: True when any string cell of that row contains the keyword
# (case-insensitive). The reduction with any(axis=1) is required: a rows-by-columns
# mask is not a valid row selector for .loc and, in pandas versions that accept it,
# can return a row once per matching cell.
mask = (
    tweet_predict_df
    .apply(lambda column: column.map(lambda x: substring in x.lower() if isinstance(x, str) else False))
    .any(axis=1)
    .to_numpy()
)
tweet_predict_df_kw = tweet_predict_df.loc[mask].reset_index(drop=True)
tweet_predict_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,The Bake and Jairazbhoy Digital Archive of #So...,1,10.282121,"[-10.282155647001614, -3.4239228898364566e-05]",303,https://bit.ly/ucla-bake-jairazbhoy
1,"From the archive: Charles Simic's ""Little Nigh...",1,8.77946,"[-8.779613840756257, -0.00015384931369369354]",176,https://pubs.lib.uiowa.edu/iowareview/issue/94...
2,Found in traditional village music for centuri...,1,7.036143,"[-7.037022029734388, -0.0008791258477608581]",303,http://folk-ukraine.com
3,"Go Hogs! 🐗 Play it over, and over, and over. ...",1,6.636746,"[-6.638056197489658, -0.0013104285540606053]",287,http://ow.ly/wcEa50F7MCy
4,"Soundbank, is now an established digital archi...",1,6.449305,"[-6.4508849589932105, -0.001580372220729189]",203,https://www.converge.today/article/living-arch...
5,"Soundbank, is now an established digital archi...",1,6.449305,"[-6.4508849589932105, -0.001580372220729189]",203,https://www.converge.today/article/living-arch...
6,"Tomorrow at noon, join us for “The Digital Arc...",1,5.512359,"[-5.516387223150155, -0.0040284496692343285]",303,https://l8r.it/6eX2
7,The July 2021 issue of The Wire is available t...,1,4.304387,"[-4.317805539418202, -0.013418731483126643]",285,http://exacted.me/TitleShowcase
8,Check out 4 tracks from POLYBIUS by Sinnesloch...,1,3.794015,"[-3.816270501059871, -0.02225556123905779]",148,https://burningwitchesrecords.com/music
9,Check out how our digital archive of programmi...,1,3.70041,"[-3.724822620794983, -0.024412964113634358]",167,https://playbill.com/article/watch-and-listen-...


Step 4:
- grab links from twitter predictions and scrape them for text 
- return a new table that can be used to predict relevance of URLs 

In [57]:
#URLs to list 
# Skip missing/non-string urls and links that point back to twitter.
# dict.fromkeys drops repeated URLs while keeping the ranked order, so each page
# is fetched and scored only once.
twitter_link_list_cv = list(dict.fromkeys(
    link for link in tweet_predict_cv_df['url'] if isinstance(link, str) and 'twitter' not in link
))
twitter_link_list = list(dict.fromkeys(
    link for link in tweet_predict_df['url'] if isinstance(link, str) and 'twitter' not in link
))

In [None]:
#scrape URL list
# Network-bound step: one HTTP request per link, so run time grows with the
# length of each list. Each call returns a frame with Title, Description and URL.
links_to_add_cv = scrape_links(twitter_link_list_cv)
links_to_add = scrape_links(twitter_link_list)

In [40]:
#remove empty descriptions 
# Pages that yielded no paragraph text give the classifier nothing to score.
links_to_add_cv = links_to_add_cv.loc[links_to_add_cv['Description'].ne('')].reset_index(drop=True)
links_to_add = links_to_add.loc[links_to_add['Description'].ne('')].reset_index(drop=True)

Step 5:
- run the predict function on scraped URLs 
- return a DF w/ title, description, url, prediction, confidence score, probability and input length

In [50]:
#run with LogRegCV 
# Uses the `path` set in the imports cell instead of repeating the absolute path,
# so the project location only has to be changed in one place.
twitter_preds_cv = lr_model_predict_cv(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    cv_int=10,
    score_type='f1',
    p_input=links_to_add_cv,
    p_feature='Description',
    filename='extended_even_model_cv_twitter',
    path=path,
)

In [51]:
#filter results by positive value and score
# Keep positive predictions and drop pages whose title or URL matches a discard term.
twitter_preds_cv_df = twitter_preds_cv.loc[twitter_preds_cv['Prediction'] == 1]
twitter_preds_cv_df = twitter_preds_cv_df[~twitter_preds_cv_df['Title'].str.contains('|'.join(discard))]
twitter_preds_cv_df = twitter_preds_cv_df[~twitter_preds_cv_df['URL'].str.contains('|'.join(discard))]
# Store the ranked frame (it was previously only displayed), so the variable
# matches what is shown, as in the Step 3 filter cells.
twitter_preds_cv_df = twitter_preds_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
twitter_preds_cv_df

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,The Iowa Review |\n \n Issue: Issue: 2(3...,Login | Register \n 2002-10-01 Volume 32 • I...,https://pubs.lib.uiowa.edu/iowareview/issue/94...,1,3.379723,"[-3.4132129760466325, -0.03348978489815084]",6383
1,Home Page - Anne Finch Digital Archive,"Search options by Peter Cross, circa 1690 \r\n...",https://tinyurl.com/Anne-Finch-Digital-Archive,1,1.428529,"[-1.6433670802215536, -0.21483812987527676]",1186
2,Bake/Jairazbhoy Digital Archive of South Asian...,"In 1963-1964, Nazir Ali Jairazbhoy (Founding C...",https://bit.ly/ucla-bake-jairazbhoy,1,1.270726,"[-1.5180761210474638, -0.24735052415585823]",1743
3,Music — burningwitchesrecords,Sign up with your email address to receive new...,https://burningwitchesrecords.com/music,1,0.659642,"[-1.0764008233441063, -0.4167586287398164]",85
4,Reggae sound systems: One Londoner and his dig...,Latest News Around The World A music enth...,https://newsatw.com/reggae-sound-systems-one-l...,1,0.431351,"[-0.9319022630470867, -0.5005517129978212]",924


In [52]:
#run with LogReg 
# Uses the `path` set in the imports cell instead of repeating the absolute path,
# so the project location only has to be changed in one place.
twitter_preds = lr_model_predict(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    p_input=links_to_add,
    p_feature='Description',
    filename='extended_even_model_twitter',
    path=path,
)

In [53]:
#filter results by positive value and score
# Keep positive predictions and drop pages whose title or URL matches a discard term.
twitter_preds_df = twitter_preds.loc[twitter_preds['Prediction'] == 1]
twitter_preds_df = twitter_preds_df[~twitter_preds_df['Title'].str.contains('|'.join(discard))]
twitter_preds_df = twitter_preds_df[~twitter_preds_df['URL'].str.contains('|'.join(discard))]
# Store the ranked frame (it was previously only displayed), so the variable
# matches what is shown, as in the Step 3 filter cells.
twitter_preds_df = twitter_preds_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
twitter_preds_df

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,The Iowa Review |\n \n Issue: Issue: 2(3...,Login | Register \n 2002-10-01 Volume 32 • I...,https://pubs.lib.uiowa.edu/iowareview/issue/94...,1,2.955472,"[-3.0062162274996624, -0.050744540304170214]",6383
1,Home Page - Anne Finch Digital Archive,"Search options by Peter Cross, circa 1690 \r\n...",https://tinyurl.com/Anne-Finch-Digital-Archive,1,1.237588,"[-1.4922949267307495, -0.25470650538087586]",1186
2,Bake/Jairazbhoy Digital Archive of South Asian...,"In 1963-1964, Nazir Ali Jairazbhoy (Founding C...",https://bit.ly/ucla-bake-jairazbhoy,1,1.055736,"[-1.3543110842296102, -0.2985745940043804]",1743
3,Music — burningwitchesrecords,Sign up with your email address to receive new...,https://burningwitchesrecords.com/music,1,0.668307,"[-1.0821207257548464, -0.41381401310968813]",85
4,Reggae sound systems: One Londoner and his dig...,Latest News Around The World A music enth...,https://newsatw.com/reggae-sound-systems-one-l...,1,0.33679,"[-0.875654364809399, -0.5388639307863358]",924


**TESTING NOTES**
- Running this pipeline against searches for 'digital archive' and then filtering Twitter results by the 'music' keyword shows no real difference between the LogReg and LogRegCV functions at either stage. The end result is a selection of 5 URLs: two digital archives featuring text and still-image content (no music data), one archive of the Iowa Review, one news item from the BBC about a fan-led sound system archive in the UK, and one music label website.