# Extracting most frequent words and most important words 

## Here we will use the dataset clean_data.csv, which we got as an output earlier

In [47]:
import pandas as pd
import numpy as np
from collections import Counter 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned dataset that the earlier program wrote to clean_data.csv.
# index_col=0 uses the first CSV column as the DataFrame index.
data = pd.read_csv('clean_data.csv',index_col=0)
# Print (rows, columns), then preview the first rows.
print(data.shape)
data.head()

(50, 1)


Unnamed: 0,Text
0,skillsmachine learningalgorithmsartificial int...
1,java developeropen water scuba hackathon winne...
2,skillsdata analysisstatisticspandascertificati...
3,skillsdata analysispublic speakingleadershipad...
4,skillsdata structuresmachine photoshop program...


In [24]:
# Pull the 'Text' column out as a plain Python list, one entry per row.
text = list(data['Text'])

# Finding frequent words

## We will find the k most frequent words for each row (only the words are kept, not their counts)
### (here k = 10; the value can be changed through the function's `k` parameter)

In [25]:
def find_k_most_frequent(list_text,k = 10):
    """
    Return the k most common whitespace-separated words of every text.

    input  - list_text (list of strings) one text per row
             k (int) number of top words to keep per row (default 10)
    output - list of strings, one per input row; each string holds that
             row's top-k words joined by ", ", most frequent first
    """
    def top_words(row):
        # most_common(k) yields (word, count) pairs; only the words are kept.
        ranked = Counter(row.split()).most_common(k)
        return ", ".join(word for word, _count in ranked)

    return [top_words(row) for row in list_text]

In [26]:
# Top words for every row, using the function's default of k = 10.
freq_data = find_k_most_frequent(text)

In [27]:
# Wrap the per-row results in a single-column DataFrame and preview it.
# NOTE(review): the column name is spelled 'Frquent words' and is written to
# the CSV header when this frame is saved; confirm nothing reads that header
# before renaming it.
freq_dataframe = pd.DataFrame(freq_data,columns = ['Frquent words'])
freq_dataframe.head()

Unnamed: 0,Frquent words
0,"research, year, bonn, artificial, computer, vi..."
1,"lab, student, media, winner, online, page, hac..."
2,"engineering, data, skillsdata, analysisstatist..."
3,"skillsdata, analysispublic, speakingleadership..."
4,"b, r, ambedkar, national, institute, skillsdat..."


In [28]:
# Save the frequent-words table as a CSV file.
freq_dataframe.to_csv('freq_words_file.csv')

## Additional (let's see which word is most often the single most frequent word of a row)

In [29]:
# With k=1 each entry is just the single most frequent word of that row.
most_freq_list = find_k_most_frequent(text,k=1)
# Count how many rows share the same top word and show the four most common.
most_freq_counter = Counter(most_freq_list)
most_freq_counter.most_common(4)

[('data', 12), ('science', 6), ('engineering', 3), ('learning', 3)]

### Hmm... data, science, engineering and learning... That makes sense.

# Finding k most essential words
### here k = 10

To find the essential words, we will use TF-IDF. <br>
TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document (term frequency), and the inverse document frequency of the word across a set of documents (idf). <br>

To sum it up ->
- tf -> gives more value to a word if the word is more common in the document
- idf -> gives more value to a word if the word is less common in the whole corpus
- tf-idf -> gives more value to a word if the word is more common in the current document and less common in the corpus

In [62]:
def find_important_words(text_data,k = 10):
    """
    Return the k highest-scoring TF-IDF words of every text.

    input  - text_data (list of strings) one document per row
             k (int) maximum number of important words per row (default 10)
    output - list_imp_words (list of strings), one per input row; each
             string holds that row's words joined by ", ", ordered from
             highest to lowest TF-IDF score

    method used -
    a TfidfVectorizer is fitted on all rows at once; for each row, the words
    that actually occur in it are ranked by their TF-IDF score and the top k
    are kept. A row with fewer than k distinct words yields fewer than k
    words (it is not padded with zero-score vocabulary words). Words with
    equal scores keep their order in the vectorizer's feature list.
    An empty text_data returns an empty list.
    """
    # Nothing to fit on; the vectorizer would raise on an empty corpus.
    if len(text_data) == 0:
        return []

    vectorizer = TfidfVectorizer()
    # fit_transform already yields the per-row scores, so no row needs to be
    # transformed a second time inside the loop.
    tfidf_matrix = vectorizer.fit_transform(text_data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_array = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_array = np.asarray(vectorizer.get_feature_names())

    list_imp_words = []
    for row_idx in range(tfidf_matrix.shape[0]):
        scores = tfidf_matrix[row_idx].toarray().ravel()
        # Only rank words present in this row (non-zero score).
        present = np.flatnonzero(scores)
        # Stable sort on the negated scores: descending score, ties in
        # feature-index order.
        ranked = present[np.argsort(-scores[present], kind="stable")]
        list_imp_words.append(', '.join(feature_array[ranked[:k]]))
    return list_imp_words

In [59]:
# Top TF-IDF words for every row (default k = 10); the length printed below
# is the number of rows processed.
imp_words = find_important_words(text)
len(imp_words)

50

In [61]:
# Wrap the per-row important words in a single-column DataFrame and preview it.
imp_words_data = pd.DataFrame(imp_words,columns = ['Important Words'])
imp_words_data.head()

Unnamed: 0,Important Words
0,"bonn, cad, physical, research, video, artifici..."
1,"lab, winner, media, online, student, mit, mell..."
2,"engineering, dinesh, analysisstatisticspandasc..."
3,"relations, chemical, guwahatipublic, guwahatie..."
4,"ambedkar, national, institute, structuresmachi..."


In [63]:
# Save the important-words table as a CSV file.
imp_words_data.to_csv('imp_words_file.csv')