**Disclaimer**
- Because the source data contains sensitive material, some information has been altered to protect the students' privacy.

# Import libraries and dataframe

In [1]:
import pandas as pd
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /Users/juliettec/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the input dataset; the cells below read its 'Comments' column.
df1 = pd.read_csv('df4.csv')

# Count Vectorizer

## Sentences

### Testing with one comment

In [4]:
# Split the comment text stored at row label 1 into sentences.
testing = nltk.sent_tokenize(df1.loc[1, 'Comments'])

In [5]:
def count_vectorize(comment, vocab=None):
    """Count how many times each token occurs in ``comment``.

    Parameters
    ----------
    comment : sequence of hashable
        Tokens to count (this notebook passes a list of sentences).
    vocab : sequence of hashable, optional
        Tokens to report counts for. When omitted or empty, the distinct
        tokens of ``comment`` are used instead.

    Returns
    -------
    dict
        Maps every vocabulary token to its number of occurrences in
        ``comment``. Vocabulary tokens that never occur map to 0, and tokens
        of ``comment`` that are not in the vocabulary are ignored.
    """
    if vocab:
        unique_words = vocab
    else:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # key order is reproducible between runs (set() ordering is not).
        unique_words = dict.fromkeys(comment)

    comment_dict = {i: 0 for i in unique_words}

    for word in comment:
        # Skip out-of-vocabulary tokens; indexing them directly would raise
        # KeyError whenever a caller-supplied vocab does not cover the comment.
        if word in comment_dict:
            comment_dict[word] += 1

    return comment_dict

# Count the sentences of the single test comment tokenized above.
test_vectorized = count_vectorize(testing)
print(test_vectorized)

{'No Comment .': 1, "'Demonstrates above average ability, motivation and attitude in this class.": 1, "'Meets acceptable standards of cooperation, work, attitude and effort .": 5, "'Conscientious about classroom responsibilities.": 1}


### All comments

In [6]:
# Sentence-tokenize every entry of the 'Comments' column.
tokenized_comments = [nltk.sent_tokenize(text) for text in df1['Comments']]

In [7]:
# Build one sentence-count dictionary per tokenized comment.
comment_count = [count_vectorize(sentences) for sentences in tokenized_comments]

In [8]:
# Preview the first ten count dictionaries.
comment_count[:10]

[{"'Meets acceptable standards of cooperation, work, attitude and effort .": 3,
  'No Comment .': 1,
  "'Demonstrates above average ability, motivation and attitude in this class.": 2,
  "'Demonstrates outstanding effort.": 1,
  "'Conscientious about classroom responsibilities.": 1},
 {'No Comment .': 1,
  "'Demonstrates above average ability, motivation and attitude in this class.": 1,
  "'Meets acceptable standards of cooperation, work, attitude and effort .": 5,
  "'Conscientious about classroom responsibilities.": 1},
 {"'Poor test scores.": 1,
  "'Incomplete work.": 3,
  'No Comment .': 2,
  "'Meets acceptable standards of cooperation, work, attitude and effort .": 2},
 {"'Meets acceptable standards of cooperation, work, attitude and effort .": 1,
  'No Comment .': 2,
  "'Demonstrates above average ability, motivation and attitude in this class.": 3,
  "'Demonstrates outstanding effort.": 1,
  "'Conscientious about classroom responsibilities.": 1},
 {'No Comment .': 1,
  "'Incompl

## Words

- This gives us every unique word found in the dataframe's comments.
    - Downside: the vocabulary is fixed when the vectorizer is fitted, so words that only appear in comments loaded later will not be counted.

In [9]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df1['Comments'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['ability', 'about', 'above', 'absences', 'absent', 'academic', 'acceptable', 'according', 'accumulated', 'add', 'advantage', 'affecting', 'after', 'all', 'along', 'also', 'always', 'am', 'an', 'and', 'appropriate', 'area', 'areas', 'as', 'ask', 'asking', 'assigned', 'assignments', 'at', 'attendance', 'attention', 'attentive', 'attentiveness', 'attitude', 'austin', 'available', 'average', 'away', 'be', 'because', 'been', 'before', 'behavior', 'benefitting', 'better', 'big', 'bit', 'both', 'bright', 'brought', 'but', 'came', 'can', 'catch', 'cell', 'chatting', 'check', 'chinese', 'class', 'classroom', 'classwork', 'come', 'comes', 'commendable', 'comment', 'complete', 'completed', 'completely', 'conference', 'conferences', 'conscientious', 'content', 'continues', 'cooperation', 'cooperatively', 'could', 'counselor', 'currently', 'daily', 'danger', 'date', 'day', 'deal', 'definitely', 'delight', 'demonstrates', 'developed', 'did', 'directions', 'discussed', 'disruptive', 'do', 'does', 'd

- This gives us the number of times each word (columns) appears in each student's comments (rows).

In [10]:
# Dense count matrix: one row per student, one column per vocabulary word.
word_count = X.toarray()

In [11]:
# Display the count matrix (NumPy abbreviates large arrays with "...").
word_count

array([[2, 1, 2, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

- This gives us the total number of words per row (i.e., per student), computed with NumPy's sum.

In [12]:
# Total word count per student: sum each row of the count matrix.
# Uses the array's own sum method, so it does not depend on an `np` name,
# which the notebook's import cell never defines.
total_num_words = word_count.sum(axis=1)

- To find each word's share of a student's total words:
    - `word_count.T`: the count matrix is transposed so that its shape lines up with `total_num_words` for the division
    - `(word_count.T/total_num_words).T`: the result is transposed back so there is again one row per student

In [13]:
# Each word's share of the student's total word count (count / row total).
# TODO: reduce the dimensionality (e.g. keep only the top 100 most-used words);
#       the columns are in alphabetical order, see get_feature_names.
# Drawback noted by the author: words that won't be used later.
# NOTE(review): a row whose total is 0 would divide 0 by 0 here - confirm that
# no student has an empty comment.
average_word_use = (word_count.T/total_num_words).T

In [14]:
# Column labels are the vectorizer's vocabulary. get_feature_names() was removed
# in scikit-learn 1.2, so prefer get_feature_names_out() when it exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_columns = vectorizer.get_feature_names_out().tolist()
else:
    word_columns = vectorizer.get_feature_names()
# Pass the labels as a flat list: wrapping them in another list makes pandas
# build a one-level MultiIndex, so every column label becomes a tuple.
average_word_count = pd.DataFrame(average_word_use, columns=word_columns)

In [15]:
# Display the per-student word-share table.
average_word_count

Unnamed: 0,ability,about,above,absences,absent,academic,acceptable,according,accumulated,add,...,will,with,wonderful,work,working,worth,writing,yet,you,zero
0,0.035714,0.017857,0.035714,0.0,0.0,0.0,0.053571,0.0,0.0,0.0,...,0.0,0.0,0.0,0.053571,0.0,0.0,0.0,0.0,0.0,0.0
1,0.016393,0.016393,0.016393,0.0,0.0,0.0,0.081967,0.0,0.0,0.0,...,0.0,0.0,0.0,0.081967,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.064516,0.0,0.0,0.0,...,0.0,0.0,0.0,0.161290,0.0,0.0,0.0,0.0,0.0,0.0
3,0.060000,0.020000,0.060000,0.0,0.0,0.0,0.020000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.020000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.108696,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
882,0.048387,0.000000,0.048387,0.0,0.0,0.0,0.048387,0.0,0.0,0.0,...,0.0,0.0,0.0,0.048387,0.0,0.0,0.0,0.0,0.0,0.0
883,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.025000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.050000,0.0,0.0,0.0,0.0,0.0,0.0
884,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.051282,0.0,0.0,0.0,...,0.0,0.0,0.0,0.051282,0.0,0.0,0.0,0.0,0.0,0.0


## New df with new word count averages

In [17]:
# Place the word-share columns alongside the original columns, aligned on the row index.
df5 = pd.concat((df1, average_word_count), axis='columns')

In [22]:
# Save the combined frame; with to_csv's defaults the row index is written as the first column.
df5.to_csv('df5.csv')

# Possible new features

Things to add as features:
- Sentiment analysis: load the comments and pass them through a sentiment model, then use the predicted probability as a feature
- find the percentage of the num