In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the stopword lists, and the WordNet corpus used by the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Root folder of the AP News text files.
# NOTE(review): absolute path that looks like a mounted Google Drive folder —
# confirm the drive is mounted and the folder exists before running.
PATH='/content/drive/My Drive/DSI Month 4/Week 13/NLP text sets/AP_News'

# Any file whose name ends in ".txt" is treated as a document.
DOC_PATTERN = r'.*\.txt'
# The capture group grabs the leading path segment (word characters and
# whitespace) in front of the first "/"; it is passed as cat_pattern, so
# presumably that segment becomes the document's category — verify.
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern=CAT_PATTERN)

In [None]:
# Raw text of every file in the corpus, in fileid order.
docs = list(map(corpus.raw, corpus.fileids()))

# First category reported for each file, in the same fileid order as `docs`.
categories = [labels[0] for labels in map(corpus.categories, corpus.fileids())]

In [None]:
# Display the raw text of the first document.
docs[0]

'HONOLULU (AP) — The University of Hawaii seeks additional funds for student mental health services, scholarships and other items in a new supplementary budget request, officials said. The Board of Regents approved the fiscal year 2020-2021 supplemental operating budget of about $28 million Thursday, The Honolulu Star-Advertiser reported. The request will be submitted to the state Legislature and Democratic Gov. The university requested $2.6 million to hire 19 psychologists for the 10-campus system. The University of Hawaii Manoa has eight psychologists, the Hilo campus has three, and the West Oahu campus has 1.75 positions, while each community college has one position, said Allyson Tanouye, who coordinates mental health throughout the university system. "The national standard is one mental health professional per 1,000 to 1,500 students," Tanouye said. "If we add the 19 positions we will be up to one per 1,500. That\'s how low we are". The mental health funding would also expand prog

In [None]:
# One label per document. corpus.categories(fileid) returns a list, so take its
# first entry; without the [0] every label is a one-element list and the
# classifier / metrics receive a list of lists instead of a flat 1-D label
# sequence.
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

In [None]:
# Display the label list built in the previous cell.
categories

In [None]:
def preprocess(docs):
    """Normalize raw documents into space-joined strings of stemmed tokens.

    Each document is split with ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lowercase form is an English stopword, are dropped.
    The remaining tokens are lowercased, lemmatized with the WordNet
    lemmatizer and then stemmed with the English Snowball stemmer.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        list[str]: One preprocessed string per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    # Build the stopword lookup once. The original evaluated
    # stopwords.words('english') for every single token and searched the
    # resulting list linearly; a set built up front gives the same answers.
    stop_words = set(stopwords.words('english'))

    preprocessed = []
    for doc in docs:
        cleaned = []
        for token in word_tokenize(doc):
            lowered = token.lower()
            # Both filters are side-effect free, so the cheap isalpha() check
            # can run first without changing which tokens survive.
            if token.isalpha() and lowered not in stop_words:
                cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))

        preprocessed.append(" ".join(cleaned))

    return preprocessed

In [None]:
# Run the tokenize / filter / lemmatize / stem cleanup over every raw document.
preprocessed = preprocess(docs)

In [None]:
# Display the cleaned version of the first document.
preprocessed[0]

'honolulu ap univers hawaii seek addit fund student mental health servic scholarship item new supplementari budget request offici said board regent approv fiscal year supplement oper budget million thursday honolulu report request submit state legislatur democrat gov univers request million hire psychologist system univers hawaii manoa eight psychologist hilo campus three west oahu campus posit communiti colleg one posit said allyson tanouy coordin mental health throughout univers system nation standard one mental health profession per student tanouy said add posit one per low mental health fund would also expand program prevent suicid reduc mental health stigma provid peer educ alert new student parent colleg transit challeng offici said largest item supplement budget million expand hawaii promis program scholarship state institut univers propos flat amount cover tuition fee hawaii resid qualifi feder pell grant look focus needi student go campus said donald straney vice presid academ

In [None]:
# Display the first raw document again (the source text of preprocessed[0]).
docs[0]

'HONOLULU (AP) — The University of Hawaii seeks additional funds for student mental health services, scholarships and other items in a new supplementary budget request, officials said. The Board of Regents approved the fiscal year 2020-2021 supplemental operating budget of about $28 million Thursday, The Honolulu Star-Advertiser reported. The request will be submitted to the state Legislature and Democratic Gov. The university requested $2.6 million to hire 19 psychologists for the 10-campus system. The University of Hawaii Manoa has eight psychologists, the Hilo campus has three, and the West Oahu campus has 1.75 positions, while each community college has one position, said Allyson Tanouye, who coordinates mental health throughout the university system. "The national standard is one mental health professional per 1,000 to 1,500 students," Tanouye said. "If we add the 19 positions we will be up to one per 1,500. That\'s how low we are". The mental health funding would also expand prog

In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed,
    categories,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Text classifier: token counts -> TF-IDF weighting -> logistic regression.
model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Fit every stage on the training split; the fitted pipeline is the cell output.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
  

In [None]:
# The long way: vectorize and classify step by step instead of through a
# Pipeline object.
import pandas as pd  # only referenced by the commented-out DataFrame previews below

# Vectorize the text data with a single TF-IDF vectorizer.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts and transform them in one call.
x_train_dtm = vectorizer.fit_transform(X_train)
# x_train_df = pd.DataFrame(x_train_dtm.toarray(), columns=vectorizer.get_feature_names())
# x_train_df.head()

# Transform the test texts with the already-fitted vectorizer (transform only,
# so nothing is re-fitted on the held-out documents).
x_test_dtm = vectorizer.transform(X_test)
# x_test_df = pd.DataFrame(x_test_dtm.toarray(), columns=vectorizer.get_feature_names())
# x_test_df.head()

# Fit the classifier of choice on the training matrix.
clf = LogisticRegression()
clf.fit(x_train_dtm, y_train)

# Score the classifier on the held-out matrix; the value is the cell output.
clf.score(x_test_dtm, y_test)

In [None]:
# Predict labels for the held-out documents with the fitted pipeline, then
# print the per-class precision / recall / F1 table.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      health       0.67      0.86      0.75         7
    politics       0.71      0.91      0.80        11
      sports       0.92      0.86      0.89        14
        tech       0.88      0.58      0.70        12

    accuracy                           0.80        44
   macro avg       0.79      0.80      0.78        44
weighted avg       0.82      0.80      0.79        44



In [None]:
# 10-fold cross-validation of the whole pipeline over all documents, scored
# with macro-averaged F1.
scores = cross_val_score(model, preprocessed, categories, cv=10, scoring='f1_macro')

# One score per fold.
scores

array([0.91428571, 0.90833333, 0.83791209, 0.86378066, 0.95804196,
       0.825     , 0.71932773, 0.71495726, 0.81556638, 0.63053613])

In [None]:
# Average of the per-fold scores.
scores.mean()

0.8187741262005968

In [None]:
import requests
from bs4 import BeautifulSoup

# Article whose text is fetched and classified in the cells below.
url = 'https://www.nytimes.com/2020/11/04/technology/california-uber-lyft-prop-22.html'

def get_url_text(url, timeout=10):
    """Download a web page and return the text of its headings, paragraphs and list items.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block the
            notebook indefinitely.

    Returns:
        str: The text of every matching tag, joined with single spaces.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of returning (and later
    # classifying) the text of an error page.
    response.raise_for_status()

    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
    soup = BeautifulSoup(response.text, "lxml")
    return ' '.join(tag.get_text() for tag in soup.find_all(TAGS))


# Fetch the article text and display it.
text = get_url_text(url)
text

'      State Results   Alabama     Alaska     Arizona     Arkansas     California     Colorado     Connecticut     Delaware     D.C.     Florida     Georgia     Hawaii     Idaho     Illinois     Indiana     Iowa     Kansas     Kentucky     Louisiana     Maine     Maryland     Massachusetts     Michigan     Minnesota     Mississippi     Missouri     Montana     Nebraska     Nevada     New Hampshire     New Jersey     New Mexico     New York     North Carolina     North Dakota     Ohio     Oklahoma     Oregon     Pennsylvania     Rhode Island     South Carolina     South Dakota     Tennessee     Texas     Utah     Vermont     Virginia     Washington     West Virginia     Wisconsin     Wyoming   Disabling auto-updates may improve reliability when using a screen reader or keyboard to navigate. Advertisement Supported by Uber and Lyft Drivers in California Will Remain Contractors The victory of Proposition 22, the most expensive initiative in the state’s history, could help gig companies re

In [None]:
# preprocess() takes a collection of documents, so wrap the single article in
# a list; the result is a one-element list.
cleaned = preprocess([text])
cleaned

['state result alabama alaska arizona arkansa california colorado connecticut delawar florida georgia hawaii idaho illinoi indiana iowa kansa kentucki louisiana main maryland massachusett michigan minnesota mississippi missouri montana nebraska nevada new hampshir new jersey new mexico new york north carolina north dakota ohio oklahoma oregon pennsylvania rhode island south carolina south dakota tennesse texa utah vermont virginia washington west virginia wisconsin wyom disabl may improv reliabl use screen reader keyboard navig advertis support uber lyft driver california remain contractor victori proposit expens initi state histori could help gig compani remak labor law throughout countri kate conger oakland driver worker gig economi compani california becom employe california voter carri uber lyft victori overwhelm approv proposit ballot measur allow gig economi compani continu treat driver independ contractor uber lyft deliveri servic doordash design measur exempt compani state labo

In [None]:
# Classify the cleaned article with the fitted pipeline and show the single predicted label.
model.predict(cleaned)[0]

'tech'